1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
8declare i32 @llvm.amdgcn.workitem.id.x() ; supplies the per-lane operand in @add_i32_varying
9
10@local_var32 = addrspace(3) global i32 undef, align 4 ; 32-bit addrspace(3) target of the atomicrmw add tests below
11@local_var64 = addrspace(3) global i64 undef, align 8 ; 64-bit counterpart; not referenced in the part of the file reviewed here - presumably used by i64 tests further down (TODO confirm)
12
13; Show what the atomic optimization pass will do for local pointers.
14
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16; Atomic add of the constant 5 to @local_var32 (acq_rel); the value returned by the atomicrmw is stored to %out.
17; Expected code on every run line: the lane whose mbcnt index is 0 issues a single ds_add_rtn_u32 of 5 * (number of active lanes), and each lane rebuilds its own result as 5 * (its mbcnt index) + the readfirstlane'd return value.
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
31; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s2, 5
32; GFX7LESS-NEXT:    s_mov_b32 m0, -1
33; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
34; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
35; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
36; GFX7LESS-NEXT:    buffer_wbinvl1
37; GFX7LESS-NEXT:  BB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
39; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
40; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
41; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
42; GFX7LESS-NEXT:    s_mov_b32 s2, -1
43; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
56; GFX8-NEXT:    s_cbranch_execz BB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
61; GFX8-NEXT:    s_mov_b32 m0, -1
62; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
63; GFX8-NEXT:    ds_add_rtn_u32 v1, v2, v1
64; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
65; GFX8-NEXT:    buffer_wbinvl1_vol
66; GFX8-NEXT:  BB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
68; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
69; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
70; GFX8-NEXT:    s_mov_b32 s3, 0xf000
71; GFX8-NEXT:    s_mov_b32 s2, -1
72; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX8-NEXT:    s_nop 0
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    s_mov_b64 s[2:3], exec
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
86; GFX9-NEXT:    s_cbranch_execz BB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
91; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
92; GFX9-NEXT:    ds_add_rtn_u32 v1, v2, v1
93; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
94; GFX9-NEXT:    buffer_wbinvl1_vol
95; GFX9-NEXT:  BB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
97; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
98; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
99; GFX9-NEXT:    s_mov_b32 s3, 0xf000
100; GFX9-NEXT:    s_mov_b32 s2, -1
101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX9-NEXT:    s_nop 0
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz BB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
119; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
120; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
121; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
122; GFX1064-NEXT:    ds_add_rtn_u32 v1, v2, v1
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    buffer_gl0_inv
125; GFX1064-NEXT:    buffer_gl1_inv
126; GFX1064-NEXT:  BB0_2:
127; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX1064-NEXT:    s_nop 0
135; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
136; GFX1064-NEXT:    s_endpgm
137;
138; GFX1032-LABEL: add_i32_constant:
139; GFX1032:       ; %bb.0: ; %entry
140; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
141; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
142; GFX1032-NEXT:    ; implicit-def: $vcc_hi
143; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
144; GFX1032-NEXT:    ; implicit-def: $vgpr1
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz BB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
150; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
151; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
152; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
153; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
154; GFX1032-NEXT:    ds_add_rtn_u32 v1, v2, v1
155; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
156; GFX1032-NEXT:    buffer_gl0_inv
157; GFX1032-NEXT:    buffer_gl1_inv
158; GFX1032-NEXT:  BB0_2:
159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX1032-NEXT:    s_nop 0
167; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
168; GFX1032-NEXT:    s_endpgm
169entry:
170  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
171  store i32 %old, i32 addrspace(1)* %out
172  ret void
173}
174
175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
176; Atomic add of the kernel argument %additive (loaded into a scalar register in the expected code) to @local_var32 (acq_rel); the value returned by the atomicrmw is stored to %out.
177; Expected code on every run line: the lane whose mbcnt index is 0 issues a single ds_add_rtn_u32 of %additive * (number of active lanes), computed with s_mul_i32, and each lane rebuilds its own result as %additive * (its mbcnt index) + the readfirstlane'd return value.
178; GFX7LESS-LABEL: add_i32_uniform:
179; GFX7LESS:       ; %bb.0: ; %entry
180; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
181; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
182; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
183; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
184; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
185; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
186; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
187; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
188; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
189; GFX7LESS-NEXT:  ; %bb.1:
190; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
191; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
192; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
193; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
194; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
195; GFX7LESS-NEXT:    s_mov_b32 m0, -1
196; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
197; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
198; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
199; GFX7LESS-NEXT:    buffer_wbinvl1
200; GFX7LESS-NEXT:  BB1_2:
201; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
202; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
203; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
205; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
206; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
207; GFX7LESS-NEXT:    s_mov_b32 s6, -1
208; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
209; GFX7LESS-NEXT:    s_endpgm
210;
211; GFX8-LABEL: add_i32_uniform:
212; GFX8:       ; %bb.0: ; %entry
213; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
214; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
215; GFX8-NEXT:    s_mov_b64 s[2:3], exec
216; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
217; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
218; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
219; GFX8-NEXT:    ; implicit-def: $vgpr1
220; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
221; GFX8-NEXT:    s_cbranch_execz BB1_2
222; GFX8-NEXT:  ; %bb.1:
223; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
224; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX8-NEXT:    s_mul_i32 s1, s0, s1
226; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
227; GFX8-NEXT:    v_mov_b32_e32 v2, s1
228; GFX8-NEXT:    s_mov_b32 m0, -1
229; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
230; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
231; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
232; GFX8-NEXT:    buffer_wbinvl1_vol
233; GFX8-NEXT:  BB1_2:
234; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
235; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
237; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
238; GFX8-NEXT:    s_mov_b32 s7, 0xf000
239; GFX8-NEXT:    s_mov_b32 s6, -1
240; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
241; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
242; GFX8-NEXT:    s_endpgm
243;
244; GFX9-LABEL: add_i32_uniform:
245; GFX9:       ; %bb.0: ; %entry
246; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
247; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
248; GFX9-NEXT:    s_mov_b64 s[2:3], exec
249; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
250; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
252; GFX9-NEXT:    ; implicit-def: $vgpr1
253; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
254; GFX9-NEXT:    s_cbranch_execz BB1_2
255; GFX9-NEXT:  ; %bb.1:
256; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
257; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX9-NEXT:    s_mul_i32 s1, s0, s1
259; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
260; GFX9-NEXT:    v_mov_b32_e32 v2, s1
261; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
262; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
263; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
264; GFX9-NEXT:    buffer_wbinvl1_vol
265; GFX9-NEXT:  BB1_2:
266; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
267; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
269; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
270; GFX9-NEXT:    s_mov_b32 s7, 0xf000
271; GFX9-NEXT:    s_mov_b32 s6, -1
272; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
273; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
274; GFX9-NEXT:    s_endpgm
275;
276; GFX1064-LABEL: add_i32_uniform:
277; GFX1064:       ; %bb.0: ; %entry
278; GFX1064-NEXT:    s_clause 0x1
279; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
280; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
281; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
282; GFX1064-NEXT:    ; implicit-def: $vgpr1
283; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
284; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
285; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
286; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
287; GFX1064-NEXT:    s_cbranch_execz BB1_2
288; GFX1064-NEXT:  ; %bb.1:
289; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
290; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
291; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
293; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
294; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
295; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
296; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
297; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
298; GFX1064-NEXT:    buffer_gl0_inv
299; GFX1064-NEXT:    buffer_gl1_inv
300; GFX1064-NEXT:  BB1_2:
301; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
302; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
303; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
305; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
306; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
307; GFX1064-NEXT:    s_mov_b32 s6, -1
308; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
309; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
310; GFX1064-NEXT:    s_endpgm
311;
312; GFX1032-LABEL: add_i32_uniform:
313; GFX1032:       ; %bb.0: ; %entry
314; GFX1032-NEXT:    s_clause 0x1
315; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
316; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
317; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
318; GFX1032-NEXT:    ; implicit-def: $vcc_hi
319; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
320; GFX1032-NEXT:    ; implicit-def: $vgpr1
321; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
322; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
323; GFX1032-NEXT:    s_cbranch_execz BB1_2
324; GFX1032-NEXT:  ; %bb.1:
325; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
326; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
327; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
329; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
330; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
331; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
332; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
333; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
334; GFX1032-NEXT:    buffer_gl0_inv
335; GFX1032-NEXT:    buffer_gl1_inv
336; GFX1032-NEXT:  BB1_2:
337; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
338; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
339; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
341; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
342; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
343; GFX1032-NEXT:    s_mov_b32 s6, -1
344; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
345; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
346; GFX1032-NEXT:    s_endpgm
347entry:
348  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
349  store i32 %old, i32 addrspace(1)* %out
350  ret void
351}
352
353define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
354; Atomic add of the per-lane value returned by llvm.amdgcn.workitem.id.x to @local_var32 (acq_rel); the value returned by the atomicrmw is stored to %out.
355; Expected code: the GFX7LESS run keeps a plain per-lane ds_add_rtn_u32; the other runs combine the lane values with DPP adds (plus v_permlanex16_b32 and readlane/writelane on the gfx1010 runs), issue one ds_add_rtn_u32 from the lane whose mbcnt index is 0, and add each lane's shifted scan value to the readfirstlane'd return value.
356; GFX7LESS-LABEL: add_i32_varying:
357; GFX7LESS:       ; %bb.0: ; %entry
358; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
359; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
360; GFX7LESS-NEXT:    s_mov_b32 m0, -1
361; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
362; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
363; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
364; GFX7LESS-NEXT:    buffer_wbinvl1
365; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
366; GFX7LESS-NEXT:    s_mov_b32 s2, -1
367; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
368; GFX7LESS-NEXT:    s_endpgm
369;
370; GFX8-LABEL: add_i32_varying:
371; GFX8:       ; %bb.0: ; %entry
372; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
373; GFX8-NEXT:    v_mov_b32_e32 v2, v0
374; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
375; GFX8-NEXT:    v_mov_b32_e32 v1, 0
376; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
377; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
378; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
379; GFX8-NEXT:    s_not_b64 exec, exec
380; GFX8-NEXT:    v_mov_b32_e32 v2, 0
381; GFX8-NEXT:    s_not_b64 exec, exec
382; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
383; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
384; GFX8-NEXT:    s_nop 1
385; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
386; GFX8-NEXT:    s_nop 1
387; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
388; GFX8-NEXT:    s_nop 1
389; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
390; GFX8-NEXT:    s_nop 1
391; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
392; GFX8-NEXT:    s_nop 1
393; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
394; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
395; GFX8-NEXT:    s_nop 0
396; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
397; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
398; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
399; GFX8-NEXT:    ; implicit-def: $vgpr0
400; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
401; GFX8-NEXT:    s_cbranch_execz BB2_2
402; GFX8-NEXT:  ; %bb.1:
403; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
404; GFX8-NEXT:    v_mov_b32_e32 v3, s4
405; GFX8-NEXT:    s_mov_b32 m0, -1
406; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
407; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
408; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
409; GFX8-NEXT:    buffer_wbinvl1_vol
410; GFX8-NEXT:  BB2_2:
411; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
412; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
413; GFX8-NEXT:    v_mov_b32_e32 v0, v1
414; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
415; GFX8-NEXT:    s_mov_b32 s3, 0xf000
416; GFX8-NEXT:    s_mov_b32 s2, -1
417; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
418; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
419; GFX8-NEXT:    s_endpgm
420;
421; GFX9-LABEL: add_i32_varying:
422; GFX9:       ; %bb.0: ; %entry
423; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
424; GFX9-NEXT:    v_mov_b32_e32 v2, v0
425; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
426; GFX9-NEXT:    v_mov_b32_e32 v1, 0
427; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
428; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
429; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
430; GFX9-NEXT:    s_not_b64 exec, exec
431; GFX9-NEXT:    v_mov_b32_e32 v2, 0
432; GFX9-NEXT:    s_not_b64 exec, exec
433; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
434; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
435; GFX9-NEXT:    s_nop 1
436; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
437; GFX9-NEXT:    s_nop 1
438; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
439; GFX9-NEXT:    s_nop 1
440; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
441; GFX9-NEXT:    s_nop 1
442; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
443; GFX9-NEXT:    s_nop 1
444; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
445; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
446; GFX9-NEXT:    s_nop 0
447; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
448; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
449; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
450; GFX9-NEXT:    ; implicit-def: $vgpr0
451; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
452; GFX9-NEXT:    s_cbranch_execz BB2_2
453; GFX9-NEXT:  ; %bb.1:
454; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
455; GFX9-NEXT:    v_mov_b32_e32 v3, s4
456; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
457; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
458; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
459; GFX9-NEXT:    buffer_wbinvl1_vol
460; GFX9-NEXT:  BB2_2:
461; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
462; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
463; GFX9-NEXT:    v_mov_b32_e32 v0, v1
464; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
465; GFX9-NEXT:    s_mov_b32 s3, 0xf000
466; GFX9-NEXT:    s_mov_b32 s2, -1
467; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
469; GFX9-NEXT:    s_endpgm
470;
471; GFX1064-LABEL: add_i32_varying:
472; GFX1064:       ; %bb.0: ; %entry
473; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
474; GFX1064-NEXT:    s_not_b64 exec, exec
475; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
476; GFX1064-NEXT:    s_not_b64 exec, exec
477; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
478; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
479; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
480; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
481; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
482; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
483; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
484; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
485; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
486; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
487; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
488; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
489; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
490; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
491; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
492; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
493; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
494; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
495; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
496; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
497; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
498; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
499; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
500; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
501; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
502; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
503; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
504; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
505; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
506; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
507; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
508; GFX1064-NEXT:    s_mov_b32 s2, -1
509; GFX1064-NEXT:    ; implicit-def: $vgpr0
510; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
511; GFX1064-NEXT:    s_cbranch_execz BB2_2
512; GFX1064-NEXT:  ; %bb.1:
513; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
514; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
515; GFX1064-NEXT:    s_mov_b32 s3, s7
516; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
517; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
518; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
519; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
520; GFX1064-NEXT:    buffer_gl0_inv
521; GFX1064-NEXT:    buffer_gl1_inv
522; GFX1064-NEXT:  BB2_2:
523; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
524; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
525; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
526; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
527; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
528; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
529; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
530; GFX1064-NEXT:    s_nop 0
531; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
532; GFX1064-NEXT:    s_endpgm
533;
534; GFX1032-LABEL: add_i32_varying:
535; GFX1032:       ; %bb.0: ; %entry
536; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
537; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
538; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
539; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
540; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
541; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
542; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
543; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
544; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
545; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
546; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
547; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
548; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
549; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
550; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
551; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
552; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
553; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
554; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
555; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
556; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
557; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
558; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
559; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
560; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
561; GFX1032-NEXT:    s_mov_b32 s2, -1
562; GFX1032-NEXT:    ; implicit-def: $vgpr0
563; GFX1032-NEXT:    ; implicit-def: $vcc_hi
564; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
565; GFX1032-NEXT:    s_cbranch_execz BB2_2
566; GFX1032-NEXT:  ; %bb.1:
567; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
568; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
569; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
570; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
571; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
572; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
573; GFX1032-NEXT:    buffer_gl0_inv
574; GFX1032-NEXT:    buffer_gl1_inv
575; GFX1032-NEXT:  BB2_2:
576; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
577; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
578; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
579; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
580; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
581; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
582; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX1032-NEXT:    s_nop 0
584; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
585; GFX1032-NEXT:    s_endpgm
586entry:
587  %lane = call i32 @llvm.amdgcn.workitem.id.x()
588  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
589  store i32 %old, i32 addrspace(1)* %out
590  ret void
591}
592
593define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
594;
595;
596; GFX7LESS-LABEL: add_i32_varying_gfx1032:
597; GFX7LESS:       ; %bb.0: ; %entry
598; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
599; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
600; GFX7LESS-NEXT:    s_mov_b32 m0, -1
601; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
602; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
603; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
604; GFX7LESS-NEXT:    buffer_wbinvl1
605; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
606; GFX7LESS-NEXT:    s_mov_b32 s2, -1
607; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
608; GFX7LESS-NEXT:    s_endpgm
609;
610; GFX8-LABEL: add_i32_varying_gfx1032:
611; GFX8:       ; %bb.0: ; %entry
612; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
613; GFX8-NEXT:    v_mov_b32_e32 v2, v0
614; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
615; GFX8-NEXT:    v_mov_b32_e32 v1, 0
616; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
617; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
618; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
619; GFX8-NEXT:    s_not_b64 exec, exec
620; GFX8-NEXT:    v_mov_b32_e32 v2, 0
621; GFX8-NEXT:    s_not_b64 exec, exec
622; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
623; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
624; GFX8-NEXT:    s_nop 1
625; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
626; GFX8-NEXT:    s_nop 1
627; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
628; GFX8-NEXT:    s_nop 1
629; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
630; GFX8-NEXT:    s_nop 1
631; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
632; GFX8-NEXT:    s_nop 1
633; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
634; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
635; GFX8-NEXT:    s_nop 0
636; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
637; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
638; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
639; GFX8-NEXT:    ; implicit-def: $vgpr0
640; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
641; GFX8-NEXT:    s_cbranch_execz BB3_2
642; GFX8-NEXT:  ; %bb.1:
643; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
644; GFX8-NEXT:    v_mov_b32_e32 v3, s4
645; GFX8-NEXT:    s_mov_b32 m0, -1
646; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
647; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
648; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
649; GFX8-NEXT:    buffer_wbinvl1_vol
650; GFX8-NEXT:  BB3_2:
651; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
652; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
653; GFX8-NEXT:    v_mov_b32_e32 v0, v1
654; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
655; GFX8-NEXT:    s_mov_b32 s3, 0xf000
656; GFX8-NEXT:    s_mov_b32 s2, -1
657; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
658; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
659; GFX8-NEXT:    s_endpgm
660;
661; GFX9-LABEL: add_i32_varying_gfx1032:
662; GFX9:       ; %bb.0: ; %entry
663; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
664; GFX9-NEXT:    v_mov_b32_e32 v2, v0
665; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
666; GFX9-NEXT:    v_mov_b32_e32 v1, 0
667; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
668; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
669; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
670; GFX9-NEXT:    s_not_b64 exec, exec
671; GFX9-NEXT:    v_mov_b32_e32 v2, 0
672; GFX9-NEXT:    s_not_b64 exec, exec
673; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
674; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
675; GFX9-NEXT:    s_nop 1
676; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
677; GFX9-NEXT:    s_nop 1
678; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
679; GFX9-NEXT:    s_nop 1
680; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
681; GFX9-NEXT:    s_nop 1
682; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
683; GFX9-NEXT:    s_nop 1
684; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
685; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
686; GFX9-NEXT:    s_nop 0
687; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
688; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
689; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
690; GFX9-NEXT:    ; implicit-def: $vgpr0
691; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
692; GFX9-NEXT:    s_cbranch_execz BB3_2
693; GFX9-NEXT:  ; %bb.1:
694; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
695; GFX9-NEXT:    v_mov_b32_e32 v3, s4
696; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
697; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
698; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
699; GFX9-NEXT:    buffer_wbinvl1_vol
700; GFX9-NEXT:  BB3_2:
701; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
702; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
703; GFX9-NEXT:    v_mov_b32_e32 v0, v1
704; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
705; GFX9-NEXT:    s_mov_b32 s3, 0xf000
706; GFX9-NEXT:    s_mov_b32 s2, -1
707; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
708; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
709; GFX9-NEXT:    s_endpgm
710;
711; GFX1064-LABEL: add_i32_varying_gfx1032:
712; GFX1064:       ; %bb.0: ; %entry
713; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
714; GFX1064-NEXT:    s_not_b64 exec, exec
715; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
716; GFX1064-NEXT:    s_not_b64 exec, exec
717; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
718; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
719; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
720; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
721; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
722; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
723; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
724; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
725; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
726; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
727; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
728; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
729; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
730; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
731; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
732; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
733; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
734; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
735; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
736; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
737; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
738; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
739; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
740; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
741; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
742; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
743; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
744; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
745; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
746; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
747; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
748; GFX1064-NEXT:    s_mov_b32 s2, -1
749; GFX1064-NEXT:    ; implicit-def: $vgpr0
750; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
751; GFX1064-NEXT:    s_cbranch_execz BB3_2
752; GFX1064-NEXT:  ; %bb.1:
753; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
754; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
755; GFX1064-NEXT:    s_mov_b32 s3, s7
756; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
757; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
758; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
759; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
760; GFX1064-NEXT:    buffer_gl0_inv
761; GFX1064-NEXT:    buffer_gl1_inv
762; GFX1064-NEXT:  BB3_2:
763; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
764; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
765; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
766; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
767; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
768; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
769; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX1064-NEXT:    s_nop 0
771; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
772; GFX1064-NEXT:    s_endpgm
773;
774; GFX1032-LABEL: add_i32_varying_gfx1032:
775; GFX1032:       ; %bb.0: ; %entry
776; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
777; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
778; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
779; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
780; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
781; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
782; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
783; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
784; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
785; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
786; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
787; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
788; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
789; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
790; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
791; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
792; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
793; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
794; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
795; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
796; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
797; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
798; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
799; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
800; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
801; GFX1032-NEXT:    s_mov_b32 s2, -1
802; GFX1032-NEXT:    ; implicit-def: $vgpr0
803; GFX1032-NEXT:    ; implicit-def: $vcc_hi
804; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
805; GFX1032-NEXT:    s_cbranch_execz BB3_2
806; GFX1032-NEXT:  ; %bb.1:
807; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
808; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
809; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
810; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
811; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
812; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
813; GFX1032-NEXT:    buffer_gl0_inv
814; GFX1032-NEXT:    buffer_gl1_inv
815; GFX1032-NEXT:  BB3_2:
816; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
817; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
818; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
819; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
820; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
821; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
822; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX1032-NEXT:    s_nop 0
824; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
825; GFX1032-NEXT:    s_endpgm
826entry:
827  %lane = call i32 @llvm.amdgcn.workitem.id.x()
828  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
829  store i32 %old, i32 addrspace(1)* %out
830  ret void
831}
832
833define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
834; Kernel under test: acq_rel atomicrmw add of the per-lane workitem id to @local_var32, storing the returned old value to %out. Check lines are autogenerated (see the NOTE at the top of the file).
835; GFX7LESS checks expect a plain ds_add_rtn_u32 on v0; the other prefixes expect a DPP add scan feeding one ds_add_rtn_u32 that is issued only where v_cmp_eq_u32 finds the mbcnt result equal to 0.
836; GFX7LESS-LABEL: add_i32_varying_gfx1064:
837; GFX7LESS:       ; %bb.0: ; %entry
838; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
839; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
840; GFX7LESS-NEXT:    s_mov_b32 m0, -1
841; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
842; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
843; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
844; GFX7LESS-NEXT:    buffer_wbinvl1
845; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
846; GFX7LESS-NEXT:    s_mov_b32 s2, -1
847; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
848; GFX7LESS-NEXT:    s_endpgm
849;
850; GFX8-LABEL: add_i32_varying_gfx1064:
851; GFX8:       ; %bb.0: ; %entry
852; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
853; GFX8-NEXT:    v_mov_b32_e32 v2, v0
854; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
855; GFX8-NEXT:    v_mov_b32_e32 v1, 0
856; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
857; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
858; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
859; GFX8-NEXT:    s_not_b64 exec, exec
860; GFX8-NEXT:    v_mov_b32_e32 v2, 0
861; GFX8-NEXT:    s_not_b64 exec, exec
862; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
863; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
864; GFX8-NEXT:    s_nop 1
865; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
866; GFX8-NEXT:    s_nop 1
867; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
868; GFX8-NEXT:    s_nop 1
869; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
870; GFX8-NEXT:    s_nop 1
871; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
872; GFX8-NEXT:    s_nop 1
873; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
874; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
875; GFX8-NEXT:    s_nop 0
876; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
877; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
878; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
879; GFX8-NEXT:    ; implicit-def: $vgpr0
880; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
881; GFX8-NEXT:    s_cbranch_execz BB4_2
882; GFX8-NEXT:  ; %bb.1:
883; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
884; GFX8-NEXT:    v_mov_b32_e32 v3, s4
885; GFX8-NEXT:    s_mov_b32 m0, -1
886; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
887; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
888; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
889; GFX8-NEXT:    buffer_wbinvl1_vol
890; GFX8-NEXT:  BB4_2:
891; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
892; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
893; GFX8-NEXT:    v_mov_b32_e32 v0, v1
894; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
895; GFX8-NEXT:    s_mov_b32 s3, 0xf000
896; GFX8-NEXT:    s_mov_b32 s2, -1
897; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
899; GFX8-NEXT:    s_endpgm
900;
901; GFX9-LABEL: add_i32_varying_gfx1064:
902; GFX9:       ; %bb.0: ; %entry
903; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
904; GFX9-NEXT:    v_mov_b32_e32 v2, v0
905; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
906; GFX9-NEXT:    v_mov_b32_e32 v1, 0
907; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
908; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
909; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
910; GFX9-NEXT:    s_not_b64 exec, exec
911; GFX9-NEXT:    v_mov_b32_e32 v2, 0
912; GFX9-NEXT:    s_not_b64 exec, exec
913; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
914; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
915; GFX9-NEXT:    s_nop 1
916; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
917; GFX9-NEXT:    s_nop 1
918; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
919; GFX9-NEXT:    s_nop 1
920; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
921; GFX9-NEXT:    s_nop 1
922; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
923; GFX9-NEXT:    s_nop 1
924; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
925; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
926; GFX9-NEXT:    s_nop 0
927; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
928; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
929; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
930; GFX9-NEXT:    ; implicit-def: $vgpr0
931; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
932; GFX9-NEXT:    s_cbranch_execz BB4_2
933; GFX9-NEXT:  ; %bb.1:
934; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
935; GFX9-NEXT:    v_mov_b32_e32 v3, s4
936; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
937; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
938; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
939; GFX9-NEXT:    buffer_wbinvl1_vol
940; GFX9-NEXT:  BB4_2:
941; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
942; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
943; GFX9-NEXT:    v_mov_b32_e32 v0, v1
944; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
945; GFX9-NEXT:    s_mov_b32 s3, 0xf000
946; GFX9-NEXT:    s_mov_b32 s2, -1
947; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
949; GFX9-NEXT:    s_endpgm
950;
951; GFX1064-LABEL: add_i32_varying_gfx1064:
952; GFX1064:       ; %bb.0: ; %entry
953; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
954; GFX1064-NEXT:    s_not_b64 exec, exec
955; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
956; GFX1064-NEXT:    s_not_b64 exec, exec
957; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
958; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
959; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
960; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
961; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
962; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
963; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
964; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
965; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
966; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
967; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
968; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
969; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
970; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
971; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
972; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
973; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
974; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
975; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
976; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
977; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
978; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
979; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
980; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
981; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
982; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
983; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
984; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
985; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
986; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
987; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
988; GFX1064-NEXT:    s_mov_b32 s2, -1
989; GFX1064-NEXT:    ; implicit-def: $vgpr0
990; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
991; GFX1064-NEXT:    s_cbranch_execz BB4_2
992; GFX1064-NEXT:  ; %bb.1:
993; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
994; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
995; GFX1064-NEXT:    s_mov_b32 s3, s7
996; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
997; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
998; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
999; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1000; GFX1064-NEXT:    buffer_gl0_inv
1001; GFX1064-NEXT:    buffer_gl1_inv
1002; GFX1064-NEXT:  BB4_2:
1003; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1004; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1005; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
1006; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
1007; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1008; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1009; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1010; GFX1064-NEXT:    s_nop 0
1011; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1012; GFX1064-NEXT:    s_endpgm
1013;
1014; GFX1032-LABEL: add_i32_varying_gfx1064:
1015; GFX1032:       ; %bb.0: ; %entry
1016; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1017; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1018; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1019; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1020; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1021; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1022; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1023; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1024; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1025; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1026; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1027; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1028; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1029; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1030; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1031; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1032; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
1033; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
1034; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1035; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1036; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1037; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1038; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
1039; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1040; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1041; GFX1032-NEXT:    s_mov_b32 s2, -1
1042; GFX1032-NEXT:    ; implicit-def: $vgpr0
1043; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1044; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1045; GFX1032-NEXT:    s_cbranch_execz BB4_2
1046; GFX1032-NEXT:  ; %bb.1:
1047; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
1048; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
1049; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1050; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1051; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
1052; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1053; GFX1032-NEXT:    buffer_gl0_inv
1054; GFX1032-NEXT:    buffer_gl1_inv
1055; GFX1032-NEXT:  BB4_2:
1056; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1057; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1058; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1059; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
1060; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1061; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1062; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1063; GFX1032-NEXT:    s_nop 0
1064; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1065; GFX1032-NEXT:    s_endpgm
1066entry:
1067  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1068  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1069  store i32 %old, i32 addrspace(1)* %out
1070  ret void
1071}
1072
1073define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1074; Kernel under test: acq_rel atomicrmw add of the constant 5 to the i64 @local_var64, storing the returned old value to %out. Check lines are autogenerated (see the NOTE at the top of the file).
1075; Every prefix expects one ds_add_rtn_u64 of (s_bcnt1 of the saved exec mask) * 5, issued only where the mbcnt result is 0, followed by v_readfirstlane of the result plus a per-lane 5 * mbcnt term.
1076; GFX7LESS-LABEL: add_i64_constant:
1077; GFX7LESS:       ; %bb.0: ; %entry
1078; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1079; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1080; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1081; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1082; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1083; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1084; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1085; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
1086; GFX7LESS-NEXT:  ; %bb.1:
1087; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1088; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1089; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1090; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1091; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1092; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1093; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1094; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1095; GFX7LESS-NEXT:    buffer_wbinvl1
1096; GFX7LESS-NEXT:  BB5_2:
1097; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1098; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1099; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1100; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1101; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1102; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1103; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1104; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1105; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1106; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1107; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1108; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1109; GFX7LESS-NEXT:    s_endpgm
1110;
1111; GFX8-LABEL: add_i64_constant:
1112; GFX8:       ; %bb.0: ; %entry
1113; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1114; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1115; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1116; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1117; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1118; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1119; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1120; GFX8-NEXT:    s_cbranch_execz BB5_2
1121; GFX8-NEXT:  ; %bb.1:
1122; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1123; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1124; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1125; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1126; GFX8-NEXT:    s_mov_b32 m0, -1
1127; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1128; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1129; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1130; GFX8-NEXT:    buffer_wbinvl1_vol
1131; GFX8-NEXT:  BB5_2:
1132; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1133; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1134; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1135; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1136; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1137; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1138; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1139; GFX8-NEXT:    s_mov_b32 s2, -1
1140; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1141; GFX8-NEXT:    s_nop 1
1142; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1143; GFX8-NEXT:    s_endpgm
1144;
1145; GFX9-LABEL: add_i64_constant:
1146; GFX9:       ; %bb.0: ; %entry
1147; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1148; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1149; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1150; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1151; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1152; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1153; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1154; GFX9-NEXT:    s_cbranch_execz BB5_2
1155; GFX9-NEXT:  ; %bb.1:
1156; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1157; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1158; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1159; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1160; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1161; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1162; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1163; GFX9-NEXT:    buffer_wbinvl1_vol
1164; GFX9-NEXT:  BB5_2:
1165; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1166; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1167; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
1168; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1169; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1170; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1171; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1172; GFX9-NEXT:    s_mov_b32 s2, -1
1173; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1174; GFX9-NEXT:    s_nop 1
1175; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1176; GFX9-NEXT:    s_endpgm
1177;
1178; GFX1064-LABEL: add_i64_constant:
1179; GFX1064:       ; %bb.0: ; %entry
1180; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1181; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1182; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1183; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1184; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
1185; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1186; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1187; GFX1064-NEXT:    s_cbranch_execz BB5_2
1188; GFX1064-NEXT:  ; %bb.1:
1189; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1190; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1191; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1192; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1193; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1194; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1195; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1196; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1197; GFX1064-NEXT:    buffer_gl0_inv
1198; GFX1064-NEXT:    buffer_gl1_inv
1199; GFX1064-NEXT:  BB5_2:
1200; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1201; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1202; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1203; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
1204; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
1205; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1206; GFX1064-NEXT:    s_mov_b32 s2, -1
1207; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1208; GFX1064-NEXT:    s_nop 1
1209; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1210; GFX1064-NEXT:    s_endpgm
1211;
1212; GFX1032-LABEL: add_i64_constant:
1213; GFX1032:       ; %bb.0: ; %entry
1214; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1215; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1216; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1217; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1218; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1219; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1220; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1221; GFX1032-NEXT:    s_cbranch_execz BB5_2
1222; GFX1032-NEXT:  ; %bb.1:
1223; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1224; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1225; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
1226; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
1227; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1228; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1229; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1230; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1231; GFX1032-NEXT:    buffer_gl0_inv
1232; GFX1032-NEXT:    buffer_gl1_inv
1233; GFX1032-NEXT:  BB5_2:
1234; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1235; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1236; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1237; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
1238; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
1239; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1240; GFX1032-NEXT:    s_mov_b32 s2, -1
1241; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1242; GFX1032-NEXT:    s_nop 1
1243; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1244; GFX1032-NEXT:    s_endpgm
1245entry:
1246  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1247  store i64 %old, i64 addrspace(1)* %out
1248  ret void
1249}
1250
1251define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1252;
1253;
1254; GFX7LESS-LABEL: add_i64_uniform:
1255; GFX7LESS:       ; %bb.0: ; %entry
1256; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1257; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1258; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1259; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1260; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1261; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1262; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1263; GFX7LESS-NEXT:    s_cbranch_execz BB6_2
1264; GFX7LESS-NEXT:  ; %bb.1:
1265; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1266; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1267; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1268; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1269; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1270; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
1271; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1272; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
1273; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1274; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1275; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1276; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1277; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1278; GFX7LESS-NEXT:    buffer_wbinvl1
1279; GFX7LESS-NEXT:  BB6_2:
1280; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1281; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1282; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1283; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1284; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1285; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1286; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1287; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
1288; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
1289; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
1290; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1291; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1292; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1293; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1294; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1295; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1296; GFX7LESS-NEXT:    s_endpgm
1297;
1298; GFX8-LABEL: add_i64_uniform:
1299; GFX8:       ; %bb.0: ; %entry
1300; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1301; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1302; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1303; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1304; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1305; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1306; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1307; GFX8-NEXT:    s_cbranch_execz BB6_2
1308; GFX8-NEXT:  ; %bb.1:
1309; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1310; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1311; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1312; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
1313; GFX8-NEXT:    s_mul_i32 s7, s3, s6
1314; GFX8-NEXT:    s_mul_i32 s6, s2, s6
1315; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1316; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
1317; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1318; GFX8-NEXT:    s_mov_b32 m0, -1
1319; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1320; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1321; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1322; GFX8-NEXT:    buffer_wbinvl1_vol
1323; GFX8-NEXT:  BB6_2:
1324; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1325; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1326; GFX8-NEXT:    s_mov_b32 s4, s0
1327; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1328; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
1329; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
1330; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
1331; GFX8-NEXT:    s_mov_b32 s5, s1
1332; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
1333; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1334; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1335; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1336; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1337; GFX8-NEXT:    s_mov_b32 s6, -1
1338; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1339; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1340; GFX8-NEXT:    s_endpgm
1341;
1342; GFX9-LABEL: add_i64_uniform:
1343; GFX9:       ; %bb.0: ; %entry
1344; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1345; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1346; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1347; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1348; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1349; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1350; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1351; GFX9-NEXT:    s_cbranch_execz BB6_2
1352; GFX9-NEXT:  ; %bb.1:
1353; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1354; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1355; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1356; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1357; GFX9-NEXT:    s_add_i32 s8, s8, s7
1358; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1359; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1360; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1361; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1362; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1363; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1364; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1365; GFX9-NEXT:    buffer_wbinvl1_vol
1366; GFX9-NEXT:  BB6_2:
1367; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1368; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1369; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1370; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1371; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1372; GFX9-NEXT:    s_mov_b32 s4, s0
1373; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1374; GFX9-NEXT:    s_mov_b32 s5, s1
1375; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1376; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1377; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1378; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1379; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1380; GFX9-NEXT:    s_mov_b32 s6, -1
1381; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1382; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1383; GFX9-NEXT:    s_endpgm
1384;
1385; GFX1064-LABEL: add_i64_uniform:
1386; GFX1064:       ; %bb.0: ; %entry
1387; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1388; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1389; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1390; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1391; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1392; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1393; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1394; GFX1064-NEXT:    s_cbranch_execz BB6_2
1395; GFX1064-NEXT:  ; %bb.1:
1396; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1397; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1398; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1399; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1400; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1401; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1402; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1403; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1404; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1405; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1406; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1407; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1408; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1409; GFX1064-NEXT:    buffer_gl0_inv
1410; GFX1064-NEXT:    buffer_gl1_inv
1411; GFX1064-NEXT:  BB6_2:
1412; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1413; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1414; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1415; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1416; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1417; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1418; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1419; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
1420; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1421; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1422; GFX1064-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
1423; GFX1064-NEXT:    s_mov_b32 s2, -1
1424; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
1425; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1426; GFX1064-NEXT:    s_endpgm
1427;
1428; GFX1032-LABEL: add_i64_uniform:
1429; GFX1032:       ; %bb.0: ; %entry
1430; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1431; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1432; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1433; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1434; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1435; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1436; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1437; GFX1032-NEXT:    s_cbranch_execz BB6_2
1438; GFX1032-NEXT:  ; %bb.1:
1439; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1440; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1441; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1442; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1443; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1444; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1445; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1446; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1447; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1448; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1449; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1450; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1451; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1452; GFX1032-NEXT:    buffer_gl0_inv
1453; GFX1032-NEXT:    buffer_gl1_inv
1454; GFX1032-NEXT:  BB6_2:
1455; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1456; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1457; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1458; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1459; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1460; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1461; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1462; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
1463; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1464; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1465; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s2, v0
1466; GFX1032-NEXT:    s_mov_b32 s2, -1
1467; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
1468; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1469; GFX1032-NEXT:    s_endpgm
1470entry:
1471  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1472  store i64 %old, i64 addrspace(1)* %out
1473  ret void
1474}
1475
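; NOTE(review): none of the RUN lines at the top of this file enable a GCN
; check prefix, so the three GCN-NOT directives below appear to be inert.
; The per-target checks generated for add_i64_varying already contain no
; v_mbcnt / s_bcnt1 instructions -- confirm before relying on these lines.
1476; GCN-NOT: v_mbcnt_lo_u32_b32
1477; GCN-NOT: v_mbcnt_hi_u32_b32
1478; GCN-NOT: s_bcnt1_i32_b64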
; add_i64_varying: every work-item atomically adds its own x workitem id
; (zero-extended to i64, so the operand differs per work-item) to the LDS
; variable @local_var64 with acq_rel ordering, then stores the returned
; pre-add value to %out.
;
; The expected code for all five check prefixes is a single ds_add_rtn_u64
; whose data operand is v[0:1] with v1 set to 0; there is no v_mbcnt /
; s_bcnt1 / exec-masking sequence in any of the expected outputs.
1479define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1480;
1481;
1482; GFX7LESS-LABEL: add_i64_varying:
1483; GFX7LESS:       ; %bb.0: ; %entry
1484; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1485; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1486; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1487; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1488; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1489; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1490; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1491; GFX7LESS-NEXT:    buffer_wbinvl1
1492; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1493; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1494; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1495; GFX7LESS-NEXT:    s_endpgm
1496;
1497; GFX8-LABEL: add_i64_varying:
1498; GFX8:       ; %bb.0: ; %entry
1499; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1500; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1501; GFX8-NEXT:    s_mov_b32 m0, -1
1502; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1503; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1504; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1505; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1506; GFX8-NEXT:    buffer_wbinvl1_vol
1507; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1508; GFX8-NEXT:    s_mov_b32 s2, -1
1509; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1510; GFX8-NEXT:    s_endpgm
1511;
1512; GFX9-LABEL: add_i64_varying:
1513; GFX9:       ; %bb.0: ; %entry
1514; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1515; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1516; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1517; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1518; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1519; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1520; GFX9-NEXT:    buffer_wbinvl1_vol
1521; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1522; GFX9-NEXT:    s_mov_b32 s2, -1
1523; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1524; GFX9-NEXT:    s_endpgm
1525;
1526; GFX1064-LABEL: add_i64_varying:
1527; GFX1064:       ; %bb.0: ; %entry
1528; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1529; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1530; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1531; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1532; GFX1064-NEXT:    s_mov_b32 s2, -1
1533; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1534; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1535; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1536; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1537; GFX1064-NEXT:    buffer_gl0_inv
1538; GFX1064-NEXT:    buffer_gl1_inv
1539; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1540; GFX1064-NEXT:    s_endpgm
1541;
1542; GFX1032-LABEL: add_i64_varying:
1543; GFX1032:       ; %bb.0: ; %entry
1544; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1545; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1546; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1547; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1548; GFX1032-NEXT:    s_mov_b32 s2, -1
1549; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1550; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1551; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1552; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1553; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1554; GFX1032-NEXT:    buffer_gl0_inv
1555; GFX1032-NEXT:    buffer_gl1_inv
1556; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1557; GFX1032-NEXT:    s_endpgm
1558entry:
  ; Per-work-item operand: the x workitem id, widened to 64 bits.
1559  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1560  %zext = zext i32 %lane to i64
  ; 64-bit atomic add on the LDS (addrspace(3)) global; %old is the value
  ; that was in memory before this work-item's add.
1561  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; Write the pre-add value to the global (addrspace(1)) output pointer.
1562  store i64 %old, i64 addrspace(1)* %out
1563  ret void
1564}
1565
; sub_i32_constant: atomically subtracts the constant 5 from the LDS variable
; @local_var32 with acq_rel ordering and stores the returned old value to
; %out.
;
; Shape of the expected code, as shown by the checks below:
;   * v_mbcnt over a saved copy of exec gives each active lane an index, and
;     only the lane whose index is 0 enters the block holding the atomic.
;   * That lane issues one ds_sub_rtn_u32 whose operand is
;     5 * s_bcnt1(saved exec).
;   * After exec is restored, each lane takes v_readfirstlane of the returned
;     value and subtracts 5 * (its own mbcnt index) before the store.
; The wave32 (GFX1032) output uses only v_mbcnt_lo and s_bcnt1_i32_b32.
1566define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1567;
1568;
1569; GFX7LESS-LABEL: sub_i32_constant:
1570; GFX7LESS:       ; %bb.0: ; %entry
1571; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1572; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1573; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1574; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1575; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1576; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1577; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1578; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1579; GFX7LESS-NEXT:  ; %bb.1:
1580; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1581; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1582; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s2, 5
1583; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1584; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1585; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1586; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1587; GFX7LESS-NEXT:    buffer_wbinvl1
1588; GFX7LESS-NEXT:  BB8_2:
1589; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1590; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1591; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1592; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1593; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1594; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1595; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1596; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1597; GFX7LESS-NEXT:    s_endpgm
1598;
1599; GFX8-LABEL: sub_i32_constant:
1600; GFX8:       ; %bb.0: ; %entry
1601; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1602; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1603; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1604; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1605; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1606; GFX8-NEXT:    ; implicit-def: $vgpr1
1607; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1608; GFX8-NEXT:    s_cbranch_execz BB8_2
1609; GFX8-NEXT:  ; %bb.1:
1610; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1611; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1612; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1613; GFX8-NEXT:    s_mov_b32 m0, -1
1614; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1615; GFX8-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1616; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1617; GFX8-NEXT:    buffer_wbinvl1_vol
1618; GFX8-NEXT:  BB8_2:
1619; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1620; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1621; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1622; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1623; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1624; GFX8-NEXT:    s_mov_b32 s2, -1
1625; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1626; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1627; GFX8-NEXT:    s_endpgm
1628;
1629; GFX9-LABEL: sub_i32_constant:
1630; GFX9:       ; %bb.0: ; %entry
1631; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1632; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1633; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1634; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1635; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1636; GFX9-NEXT:    ; implicit-def: $vgpr1
1637; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1638; GFX9-NEXT:    s_cbranch_execz BB8_2
1639; GFX9-NEXT:  ; %bb.1:
1640; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1641; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1642; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1643; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1644; GFX9-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1645; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1646; GFX9-NEXT:    buffer_wbinvl1_vol
1647; GFX9-NEXT:  BB8_2:
1648; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1649; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1650; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1651; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1652; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1653; GFX9-NEXT:    s_mov_b32 s2, -1
1654; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1655; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1656; GFX9-NEXT:    s_endpgm
1657;
1658; GFX1064-LABEL: sub_i32_constant:
1659; GFX1064:       ; %bb.0: ; %entry
1660; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1661; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1662; GFX1064-NEXT:    ; implicit-def: $vgpr1
1663; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1664; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1665; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1666; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1667; GFX1064-NEXT:    s_cbranch_execz BB8_2
1668; GFX1064-NEXT:  ; %bb.1:
1669; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1670; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1671; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1672; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1673; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1674; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1675; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1676; GFX1064-NEXT:    buffer_gl0_inv
1677; GFX1064-NEXT:    buffer_gl1_inv
1678; GFX1064-NEXT:  BB8_2:
1679; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1680; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1681; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1682; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1683; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1684; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1685; GFX1064-NEXT:    s_mov_b32 s2, -1
1686; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1687; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1688; GFX1064-NEXT:    s_endpgm
1689;
1690; GFX1032-LABEL: sub_i32_constant:
1691; GFX1032:       ; %bb.0: ; %entry
1692; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1693; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1694; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1695; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1696; GFX1032-NEXT:    ; implicit-def: $vgpr1
1697; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1698; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1699; GFX1032-NEXT:    s_cbranch_execz BB8_2
1700; GFX1032-NEXT:  ; %bb.1:
1701; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1702; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1703; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1704; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1705; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1706; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1707; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1708; GFX1032-NEXT:    buffer_gl0_inv
1709; GFX1032-NEXT:    buffer_gl1_inv
1710; GFX1032-NEXT:  BB8_2:
1711; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1712; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1713; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1714; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1715; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1716; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1717; GFX1032-NEXT:    s_mov_b32 s2, -1
1718; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1720; GFX1032-NEXT:    s_endpgm
1721entry:
  ; Compile-time-constant operand (5); %old is the value held in
  ; @local_var32 before this work-item's subtraction.
1722  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  ; Write the pre-subtract value to the global (addrspace(1)) output pointer.
1723  store i32 %old, i32 addrspace(1)* %out
1724  ret void
1725}
1726
; sub_i32_uniform: atomically subtracts the kernel argument %subitive from
; the LDS variable @local_var32 with acq_rel ordering and stores the returned
; old value to %out.
;
; Shape of the expected code, as shown by the checks below:
;   * %subitive is fetched with a scalar load (s_load_dword), i.e. it lives
;     in an SGPR rather than a per-lane register.
;   * v_mbcnt over a saved copy of exec gives each active lane an index, and
;     only the lane whose index is 0 enters the block holding the atomic.
;   * That lane issues one ds_sub_rtn_u32 whose operand is
;     %subitive * s_bcnt1(saved exec), computed with s_mul_i32.
;   * After exec is restored, each lane takes v_readfirstlane of the returned
;     value and subtracts %subitive * (its own mbcnt index), computed with
;     v_mul_lo_u32, before the store.
; The wave32 (GFX1032) output uses only v_mbcnt_lo and s_bcnt1_i32_b32.
1727define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1728;
1729;
1730; GFX7LESS-LABEL: sub_i32_uniform:
1731; GFX7LESS:       ; %bb.0: ; %entry
1732; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1733; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1734; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
1735; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1736; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1737; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1738; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1739; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1740; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
1741; GFX7LESS-NEXT:  ; %bb.1:
1742; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1743; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1744; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
1745; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1746; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
1747; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1748; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1749; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1750; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1751; GFX7LESS-NEXT:    buffer_wbinvl1
1752; GFX7LESS-NEXT:  BB9_2:
1753; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1754; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1755; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1756; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1757; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1758; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1759; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1760; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1761; GFX7LESS-NEXT:    s_endpgm
1762;
1763; GFX8-LABEL: sub_i32_uniform:
1764; GFX8:       ; %bb.0: ; %entry
1765; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1766; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1767; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1768; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1769; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1770; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1771; GFX8-NEXT:    ; implicit-def: $vgpr1
1772; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1773; GFX8-NEXT:    s_cbranch_execz BB9_2
1774; GFX8-NEXT:  ; %bb.1:
1775; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1776; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1777; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1778; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1779; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1780; GFX8-NEXT:    s_mov_b32 m0, -1
1781; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1782; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1783; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1784; GFX8-NEXT:    buffer_wbinvl1_vol
1785; GFX8-NEXT:  BB9_2:
1786; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1787; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1788; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1789; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1790; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1791; GFX8-NEXT:    s_mov_b32 s6, -1
1792; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1793; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1794; GFX8-NEXT:    s_endpgm
1795;
1796; GFX9-LABEL: sub_i32_uniform:
1797; GFX9:       ; %bb.0: ; %entry
1798; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1799; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
1800; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1801; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1802; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1803; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1804; GFX9-NEXT:    ; implicit-def: $vgpr1
1805; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1806; GFX9-NEXT:    s_cbranch_execz BB9_2
1807; GFX9-NEXT:  ; %bb.1:
1808; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1809; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1810; GFX9-NEXT:    s_mul_i32 s1, s0, s1
1811; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1812; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1813; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1814; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1815; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1816; GFX9-NEXT:    buffer_wbinvl1_vol
1817; GFX9-NEXT:  BB9_2:
1818; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
1819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
1821; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1822; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1823; GFX9-NEXT:    s_mov_b32 s6, -1
1824; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1825; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1826; GFX9-NEXT:    s_endpgm
1827;
1828; GFX1064-LABEL: sub_i32_uniform:
1829; GFX1064:       ; %bb.0: ; %entry
1830; GFX1064-NEXT:    s_clause 0x1
1831; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1832; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
1833; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1834; GFX1064-NEXT:    ; implicit-def: $vgpr1
1835; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1836; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1837; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1838; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1839; GFX1064-NEXT:    s_cbranch_execz BB9_2
1840; GFX1064-NEXT:  ; %bb.1:
1841; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1842; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1843; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1844; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
1845; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
1846; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1847; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1848; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1849; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1850; GFX1064-NEXT:    buffer_gl0_inv
1851; GFX1064-NEXT:    buffer_gl1_inv
1852; GFX1064-NEXT:  BB9_2:
1853; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1854; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
1855; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1856; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
1857; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1858; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1859; GFX1064-NEXT:    s_mov_b32 s6, -1
1860; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1861; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1862; GFX1064-NEXT:    s_endpgm
1863;
1864; GFX1032-LABEL: sub_i32_uniform:
1865; GFX1032:       ; %bb.0: ; %entry
1866; GFX1032-NEXT:    s_clause 0x1
1867; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1868; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
1869; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1870; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1871; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1872; GFX1032-NEXT:    ; implicit-def: $vgpr1
1873; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1874; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1875; GFX1032-NEXT:    s_cbranch_execz BB9_2
1876; GFX1032-NEXT:  ; %bb.1:
1877; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1878; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1879; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1880; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
1881; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
1882; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1883; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1884; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1885; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1886; GFX1032-NEXT:    buffer_gl0_inv
1887; GFX1032-NEXT:    buffer_gl1_inv
1888; GFX1032-NEXT:  BB9_2:
1889; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1890; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1891; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1892; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
1893; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1894; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1895; GFX1032-NEXT:    s_mov_b32 s6, -1
1896; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1897; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1898; GFX1032-NEXT:    s_endpgm
1899entry:
  ; Operand is the kernel argument %subitive; %old is the value held in
  ; @local_var32 before this work-item's subtraction.
1900  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
  ; Write the pre-subtract value to the global (addrspace(1)) output pointer.
1901  store i32 %old, i32 addrspace(1)* %out
1902  ret void
1903}
1904
1905define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1906;
1907;
1908; GFX7LESS-LABEL: sub_i32_varying:
1909; GFX7LESS:       ; %bb.0: ; %entry
1910; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1911; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1912; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1913; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1914; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1915; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1916; GFX7LESS-NEXT:    buffer_wbinvl1
1917; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1918; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1919; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1920; GFX7LESS-NEXT:    s_endpgm
1921;
1922; GFX8-LABEL: sub_i32_varying:
1923; GFX8:       ; %bb.0: ; %entry
1924; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1925; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1926; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1927; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1928; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1929; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1930; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1931; GFX8-NEXT:    s_not_b64 exec, exec
1932; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1933; GFX8-NEXT:    s_not_b64 exec, exec
1934; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1935; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1936; GFX8-NEXT:    s_nop 1
1937; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1938; GFX8-NEXT:    s_nop 1
1939; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1940; GFX8-NEXT:    s_nop 1
1941; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1942; GFX8-NEXT:    s_nop 1
1943; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1944; GFX8-NEXT:    s_nop 1
1945; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1946; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
1947; GFX8-NEXT:    s_nop 0
1948; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1949; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1950; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1951; GFX8-NEXT:    ; implicit-def: $vgpr0
1952; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1953; GFX8-NEXT:    s_cbranch_execz BB10_2
1954; GFX8-NEXT:  ; %bb.1:
1955; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1956; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1957; GFX8-NEXT:    s_mov_b32 m0, -1
1958; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1959; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1960; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1961; GFX8-NEXT:    buffer_wbinvl1_vol
1962; GFX8-NEXT:  BB10_2:
1963; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1964; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1965; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1966; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1967; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1968; GFX8-NEXT:    s_mov_b32 s2, -1
1969; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1970; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1971; GFX8-NEXT:    s_endpgm
1972;
1973; GFX9-LABEL: sub_i32_varying:
1974; GFX9:       ; %bb.0: ; %entry
1975; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1976; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1977; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1978; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1979; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1980; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1981; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1982; GFX9-NEXT:    s_not_b64 exec, exec
1983; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1984; GFX9-NEXT:    s_not_b64 exec, exec
1985; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1986; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1987; GFX9-NEXT:    s_nop 1
1988; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1989; GFX9-NEXT:    s_nop 1
1990; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1991; GFX9-NEXT:    s_nop 1
1992; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1993; GFX9-NEXT:    s_nop 1
1994; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1995; GFX9-NEXT:    s_nop 1
1996; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1997; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
1998; GFX9-NEXT:    s_nop 0
1999; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2000; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2001; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2002; GFX9-NEXT:    ; implicit-def: $vgpr0
2003; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2004; GFX9-NEXT:    s_cbranch_execz BB10_2
2005; GFX9-NEXT:  ; %bb.1:
2006; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2007; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2008; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2009; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2010; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2011; GFX9-NEXT:    buffer_wbinvl1_vol
2012; GFX9-NEXT:  BB10_2:
2013; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2014; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2015; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2016; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2017; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2018; GFX9-NEXT:    s_mov_b32 s2, -1
2019; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2021; GFX9-NEXT:    s_endpgm
2022;
2023; GFX1064-LABEL: sub_i32_varying:
2024; GFX1064:       ; %bb.0: ; %entry
2025; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2026; GFX1064-NEXT:    s_not_b64 exec, exec
2027; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2028; GFX1064-NEXT:    s_not_b64 exec, exec
2029; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2030; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2031; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2032; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2033; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2034; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2035; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2036; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2037; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2038; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2039; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2040; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2041; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2042; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2043; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2044; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2045; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2046; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2047; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2048; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2049; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2050; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2051; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2052; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2053; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2054; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2055; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2056; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2057; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2058; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2059; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2060; GFX1064-NEXT:    s_mov_b32 s2, -1
2061; GFX1064-NEXT:    ; implicit-def: $vgpr0
2062; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2063; GFX1064-NEXT:    s_cbranch_execz BB10_2
2064; GFX1064-NEXT:  ; %bb.1:
2065; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2066; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2067; GFX1064-NEXT:    s_mov_b32 s3, s7
2068; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2069; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2070; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v7, v4
2071; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2072; GFX1064-NEXT:    buffer_gl0_inv
2073; GFX1064-NEXT:    buffer_gl1_inv
2074; GFX1064-NEXT:  BB10_2:
2075; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2076; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2077; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2078; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2079; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2080; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2081; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2082; GFX1064-NEXT:    s_nop 0
2083; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2084; GFX1064-NEXT:    s_endpgm
2085;
2086; GFX1032-LABEL: sub_i32_varying:
2087; GFX1032:       ; %bb.0: ; %entry
2088; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2089; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2090; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2091; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2092; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2093; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2094; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2095; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2096; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2097; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2098; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2099; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2100; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2101; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2102; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2103; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2104; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2105; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2106; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2107; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2108; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2109; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2110; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2111; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2112; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2113; GFX1032-NEXT:    s_mov_b32 s2, -1
2114; GFX1032-NEXT:    ; implicit-def: $vgpr0
2115; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2116; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2117; GFX1032-NEXT:    s_cbranch_execz BB10_2
2118; GFX1032-NEXT:  ; %bb.1:
2119; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2120; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2121; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2122; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2123; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v7, v4
2124; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2125; GFX1032-NEXT:    buffer_gl0_inv
2126; GFX1032-NEXT:    buffer_gl1_inv
2127; GFX1032-NEXT:  BB10_2:
2128; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2129; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2130; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2131; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2132; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2133; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2134; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2135; GFX1032-NEXT:    s_nop 0
2136; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2137; GFX1032-NEXT:    s_endpgm
2138entry:
2139  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2140  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2141  store i32 %old, i32 addrspace(1)* %out
2142  ret void
2143}
2144
2145define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2146; Atomically subtracts the constant 5 from @local_var64 (acq_rel) and stores
; the returned old value to %out.
;
; Shape of the expected code on every target below:
;   * each lane computes its rank among the active lanes with v_mbcnt on a
;     copy of exec (wave32 needs only the "lo" half);
;   * only the rank-0 lane enters %bb.1 and issues one ds_sub_rtn_u64 whose
;     operand is 5 * popcount(exec), built from s_bcnt1 and 24-bit multiplies;
;   * after the join, every lane rebuilds its own result as
;     readfirstlane(old) - 5 * rank using a 64-bit sub/subb pair.
2147;
2148; GFX7LESS-LABEL: sub_i64_constant:
2149; GFX7LESS:       ; %bb.0: ; %entry
2150; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2151; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2152; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2153; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
2154; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2155; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2156; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2157; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
2158; GFX7LESS-NEXT:  ; %bb.1:
2159; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2160; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2161; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2162; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2163; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2164; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2165; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2166; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2167; GFX7LESS-NEXT:    buffer_wbinvl1
2168; GFX7LESS-NEXT:  BB11_2:
2169; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2170; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
2171; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
2172; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2173; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2174; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2175; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2176; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2177; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2178; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2179; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2180; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2181; GFX7LESS-NEXT:    s_endpgm
2182;
2183; GFX8-LABEL: sub_i64_constant:
2184; GFX8:       ; %bb.0: ; %entry
2185; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2186; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2187; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2188; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2189; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2190; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2191; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2192; GFX8-NEXT:    s_cbranch_execz BB11_2
2193; GFX8-NEXT:  ; %bb.1:
2194; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2195; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2196; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2197; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2198; GFX8-NEXT:    s_mov_b32 m0, -1
2199; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2200; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2201; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2202; GFX8-NEXT:    buffer_wbinvl1_vol
2203; GFX8-NEXT:  BB11_2:
2204; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2205; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
2206; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
2207; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2208; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2209; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2210; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2211; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2212; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2213; GFX8-NEXT:    s_mov_b32 s2, -1
2214; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2215; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2216; GFX8-NEXT:    s_endpgm
2217;
2218; GFX9-LABEL: sub_i64_constant:
2219; GFX9:       ; %bb.0: ; %entry
2220; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2221; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2222; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2223; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2224; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2225; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2226; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2227; GFX9-NEXT:    s_cbranch_execz BB11_2
2228; GFX9-NEXT:  ; %bb.1:
2229; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2230; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2231; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2232; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2233; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2234; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2235; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2236; GFX9-NEXT:    buffer_wbinvl1_vol
2237; GFX9-NEXT:  BB11_2:
2238; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2239; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2240; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2241; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2242; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2243; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2244; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2245; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2246; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2247; GFX9-NEXT:    s_mov_b32 s2, -1
2248; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2249; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2250; GFX9-NEXT:    s_endpgm
2251;
2252; GFX1064-LABEL: sub_i64_constant:
2253; GFX1064:       ; %bb.0: ; %entry
2254; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2255; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2256; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2257; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2258; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
2259; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2260; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2261; GFX1064-NEXT:    s_cbranch_execz BB11_2
2262; GFX1064-NEXT:  ; %bb.1:
2263; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2264; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2265; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2266; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2267; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2268; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2269; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2270; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2271; GFX1064-NEXT:    buffer_gl0_inv
2272; GFX1064-NEXT:    buffer_gl1_inv
2273; GFX1064-NEXT:  BB11_2:
2274; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2275; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2276; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2277; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2278; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2279; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2280; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
2281; GFX1064-NEXT:    s_mov_b32 s2, -1
2282; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2283; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2284; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2285; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2286; GFX1064-NEXT:    s_endpgm
2287;
2288; GFX1032-LABEL: sub_i64_constant:
2289; GFX1032:       ; %bb.0: ; %entry
2290; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2291; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2292; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2293; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
2294; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2295; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2296; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2297; GFX1032-NEXT:    s_cbranch_execz BB11_2
2298; GFX1032-NEXT:  ; %bb.1:
2299; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2300; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2301; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
2302; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
2303; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2304; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2305; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2306; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2307; GFX1032-NEXT:    buffer_gl0_inv
2308; GFX1032-NEXT:    buffer_gl1_inv
2309; GFX1032-NEXT:  BB11_2:
2310; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2311; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2312; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2313; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2314; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2315; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2316; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
2317; GFX1032-NEXT:    s_mov_b32 s2, -1
2318; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2319; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2320; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2321; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2322; GFX1032-NEXT:    s_endpgm
2323entry:
; The operand is the compile-time constant 5, identical for every lane.
2324  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
; Using the returned value forces the "rtn" form of the DS instruction.
2325  store i64 %old, i64 addrspace(1)* %out
2326  ret void
2327}
2328
2329define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2330; Atomically subtracts the kernel argument %subitive from @local_var64
; (acq_rel) and stores the returned old value to %out.
;
; Shape of the expected code on every target below:
;   * both kernel arguments arrive through one s_load_dwordx4 into s[0:3];
;   * each lane computes its rank among the active lanes with v_mbcnt;
;   * only the rank-0 lane enters %bb.1 and issues one ds_sub_rtn_u64 whose
;     operand is s[2:3] * popcount(exec), a 64-bit product assembled from
;     32-bit mul / mul_hi / add pieces;
;   * after the join, every lane rebuilds its own result as
;     readfirstlane(old) - s[2:3] * rank.
2331;
2332; GFX7LESS-LABEL: sub_i64_uniform:
2333; GFX7LESS:       ; %bb.0: ; %entry
2334; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2335; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2336; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2337; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2338; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2339; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2340; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2341; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2342; GFX7LESS-NEXT:  ; %bb.1:
2343; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2344; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2345; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2346; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2347; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2348; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2349; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2350; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2351; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2352; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2353; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2354; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2355; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2356; GFX7LESS-NEXT:    buffer_wbinvl1
2357; GFX7LESS-NEXT:  BB12_2:
2358; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2359; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2360; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2361; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2362; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2363; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2364; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2365; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2366; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2367; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2368; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2369; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2370; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2371; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2372; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2373; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2374; GFX7LESS-NEXT:    s_endpgm
2375;
2376; GFX8-LABEL: sub_i64_uniform:
2377; GFX8:       ; %bb.0: ; %entry
2378; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2379; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2380; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2381; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2382; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2383; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2384; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2385; GFX8-NEXT:    s_cbranch_execz BB12_2
2386; GFX8-NEXT:  ; %bb.1:
2387; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2388; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2389; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2390; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2391; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2392; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2393; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2394; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2395; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2396; GFX8-NEXT:    s_mov_b32 m0, -1
2397; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2398; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2399; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2400; GFX8-NEXT:    buffer_wbinvl1_vol
2401; GFX8-NEXT:  BB12_2:
2402; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2403; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2404; GFX8-NEXT:    s_mov_b32 s4, s0
2405; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2406; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2407; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2408; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2409; GFX8-NEXT:    s_mov_b32 s5, s1
2410; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2411; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2412; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2413; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2414; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2415; GFX8-NEXT:    s_mov_b32 s6, -1
2416; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2417; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2418; GFX8-NEXT:    s_endpgm
2419;
2420; GFX9-LABEL: sub_i64_uniform:
2421; GFX9:       ; %bb.0: ; %entry
2422; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2423; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2424; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2425; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2426; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2427; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2428; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2429; GFX9-NEXT:    s_cbranch_execz BB12_2
2430; GFX9-NEXT:  ; %bb.1:
2431; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2432; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2433; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2434; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2435; GFX9-NEXT:    s_add_i32 s8, s8, s7
2436; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2437; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2438; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2439; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2440; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2441; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2442; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2443; GFX9-NEXT:    buffer_wbinvl1_vol
2444; GFX9-NEXT:  BB12_2:
2445; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2447; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2448; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2449; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2450; GFX9-NEXT:    s_mov_b32 s4, s0
2451; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2452; GFX9-NEXT:    s_mov_b32 s5, s1
2453; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2454; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2455; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2456; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2457; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2458; GFX9-NEXT:    s_mov_b32 s6, -1
2459; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2460; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2461; GFX9-NEXT:    s_endpgm
2462;
2463; GFX1064-LABEL: sub_i64_uniform:
2464; GFX1064:       ; %bb.0: ; %entry
2465; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2466; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2467; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2468; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2469; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
2470; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2471; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2472; GFX1064-NEXT:    s_cbranch_execz BB12_2
2473; GFX1064-NEXT:  ; %bb.1:
2474; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2475; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2476; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2477; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2478; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2479; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2480; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2481; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2482; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2483; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2484; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2485; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2486; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2487; GFX1064-NEXT:    buffer_gl0_inv
2488; GFX1064-NEXT:    buffer_gl1_inv
2489; GFX1064-NEXT:  BB12_2:
2490; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2491; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2492; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2493; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2494; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2495; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2496; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2497; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
2498; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2499; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2500; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v0
2501; GFX1064-NEXT:    s_mov_b32 s2, -1
2502; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
2503; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2504; GFX1064-NEXT:    s_endpgm
2505;
2506; GFX1032-LABEL: sub_i64_uniform:
2507; GFX1032:       ; %bb.0: ; %entry
2508; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2509; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2510; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2511; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
2512; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2513; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2514; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2515; GFX1032-NEXT:    s_cbranch_execz BB12_2
2516; GFX1032-NEXT:  ; %bb.1:
2517; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2518; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2519; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2520; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2521; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2522; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2523; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2524; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2525; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2526; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2527; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2528; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2529; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2530; GFX1032-NEXT:    buffer_gl0_inv
2531; GFX1032-NEXT:    buffer_gl1_inv
2532; GFX1032-NEXT:  BB12_2:
2533; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2534; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2535; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2536; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2537; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2538; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2539; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2540; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
2541; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2542; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2543; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v0
2544; GFX1032-NEXT:    s_mov_b32 s2, -1
2545; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
2546; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2547; GFX1032-NEXT:    s_endpgm
2548entry:
; %subitive is a kernel argument, so it is the same value in every lane.
2549  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
; Using the returned value forces the "rtn" form of the DS instruction.
2550  store i64 %old, i64 addrspace(1)* %out
2551  ret void
2552}
2553
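2554; The NOT-directives formerly here used the GCN prefix, which no RUN line enables, so
2555; FileCheck never evaluated them. Their intent (no v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32
2556; or s_bcnt1_i32_b64 in sub_i64_varying) is already enforced by the NEXT chains below.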
2557define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2558; Atomically subtracts the zero-extended workitem id from @local_var64
; (acq_rel) and stores the returned old value to %out.
;
; Unlike the constant and uniform 64-bit cases, the expectations below show
; no lane-rank / popcount sequence: every target zeroes v1, pairs it with v0
; as the 64-bit operand and issues ds_sub_rtn_u64 directly, then stores the
; result. In other words, the per-lane (divergent) 64-bit operand is left
; unoptimized in this test's expected output.
2559;
2560; GFX7LESS-LABEL: sub_i64_varying:
2561; GFX7LESS:       ; %bb.0: ; %entry
2562; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2563; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2564; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2565; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2566; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2567; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2568; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2569; GFX7LESS-NEXT:    buffer_wbinvl1
2570; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2571; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2572; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2573; GFX7LESS-NEXT:    s_endpgm
2574;
2575; GFX8-LABEL: sub_i64_varying:
2576; GFX8:       ; %bb.0: ; %entry
2577; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2578; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2579; GFX8-NEXT:    s_mov_b32 m0, -1
2580; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2581; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2582; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2583; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2584; GFX8-NEXT:    buffer_wbinvl1_vol
2585; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2586; GFX8-NEXT:    s_mov_b32 s2, -1
2587; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2588; GFX8-NEXT:    s_endpgm
2589;
2590; GFX9-LABEL: sub_i64_varying:
2591; GFX9:       ; %bb.0: ; %entry
2592; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2593; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2594; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2595; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2596; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2597; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2598; GFX9-NEXT:    buffer_wbinvl1_vol
2599; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2600; GFX9-NEXT:    s_mov_b32 s2, -1
2601; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2602; GFX9-NEXT:    s_endpgm
2603;
2604; GFX1064-LABEL: sub_i64_varying:
2605; GFX1064:       ; %bb.0: ; %entry
2606; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2607; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2608; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2609; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2610; GFX1064-NEXT:    s_mov_b32 s2, -1
2611; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2612; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2613; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2614; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2615; GFX1064-NEXT:    buffer_gl0_inv
2616; GFX1064-NEXT:    buffer_gl1_inv
2617; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2618; GFX1064-NEXT:    s_endpgm
2619;
2620; GFX1032-LABEL: sub_i64_varying:
2621; GFX1032:       ; %bb.0: ; %entry
2622; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2623; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2624; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2625; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2626; GFX1032-NEXT:    s_mov_b32 s2, -1
2627; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2628; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2629; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2630; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2631; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2632; GFX1032-NEXT:    buffer_gl0_inv
2633; GFX1032-NEXT:    buffer_gl1_inv
2634; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2635; GFX1032-NEXT:    s_endpgm
2636entry:
; The operand differs per lane: it is derived from the workitem id.
2637  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; Widen to i64; the high half is the constant 0 seen as "v_mov_b32 v1, 0" above.
2638  %zext = zext i32 %lane to i64
2639  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
2640  store i64 %old, i64 addrspace(1)* %out
2641  ret void
2642}
2643
2644define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2645;
2646;
2647; GFX7LESS-LABEL: and_i32_varying:
2648; GFX7LESS:       ; %bb.0: ; %entry
2649; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2650; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2651; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2652; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2653; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2654; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2655; GFX7LESS-NEXT:    buffer_wbinvl1
2656; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2657; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2658; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2659; GFX7LESS-NEXT:    s_endpgm
2660;
2661; GFX8-LABEL: and_i32_varying:
2662; GFX8:       ; %bb.0: ; %entry
2663; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2664; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2665; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2666; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2667; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2668; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2669; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2670; GFX8-NEXT:    s_not_b64 exec, exec
2671; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2672; GFX8-NEXT:    s_not_b64 exec, exec
2673; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2674; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2675; GFX8-NEXT:    s_nop 1
2676; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2677; GFX8-NEXT:    s_nop 1
2678; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2679; GFX8-NEXT:    s_nop 1
2680; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2681; GFX8-NEXT:    s_nop 1
2682; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2683; GFX8-NEXT:    s_nop 1
2684; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2685; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2686; GFX8-NEXT:    s_nop 0
2687; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2688; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2689; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2690; GFX8-NEXT:    ; implicit-def: $vgpr0
2691; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2692; GFX8-NEXT:    s_cbranch_execz BB14_2
2693; GFX8-NEXT:  ; %bb.1:
2694; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2695; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2696; GFX8-NEXT:    s_mov_b32 m0, -1
2697; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2698; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2699; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2700; GFX8-NEXT:    buffer_wbinvl1_vol
2701; GFX8-NEXT:  BB14_2:
2702; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2703; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2704; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2705; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2706; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2707; GFX8-NEXT:    s_mov_b32 s2, -1
2708; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2709; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2710; GFX8-NEXT:    s_endpgm
2711;
2712; GFX9-LABEL: and_i32_varying:
2713; GFX9:       ; %bb.0: ; %entry
2714; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2715; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2716; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2717; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2718; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2719; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2720; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2721; GFX9-NEXT:    s_not_b64 exec, exec
2722; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2723; GFX9-NEXT:    s_not_b64 exec, exec
2724; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2725; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2726; GFX9-NEXT:    s_nop 1
2727; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2728; GFX9-NEXT:    s_nop 1
2729; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2730; GFX9-NEXT:    s_nop 1
2731; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2732; GFX9-NEXT:    s_nop 1
2733; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2734; GFX9-NEXT:    s_nop 1
2735; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2736; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2737; GFX9-NEXT:    s_nop 0
2738; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2739; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2740; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2741; GFX9-NEXT:    ; implicit-def: $vgpr0
2742; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2743; GFX9-NEXT:    s_cbranch_execz BB14_2
2744; GFX9-NEXT:  ; %bb.1:
2745; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2746; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2747; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2748; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2749; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2750; GFX9-NEXT:    buffer_wbinvl1_vol
2751; GFX9-NEXT:  BB14_2:
2752; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2753; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2754; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2755; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2756; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2757; GFX9-NEXT:    s_mov_b32 s2, -1
2758; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2759; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2760; GFX9-NEXT:    s_endpgm
2761;
2762; GFX1064-LABEL: and_i32_varying:
2763; GFX1064:       ; %bb.0: ; %entry
2764; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2765; GFX1064-NEXT:    s_not_b64 exec, exec
2766; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2767; GFX1064-NEXT:    s_not_b64 exec, exec
2768; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2769; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2770; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
2771; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2772; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2773; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2774; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2775; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2776; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2777; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2778; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2779; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2780; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2781; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2782; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2783; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2784; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2785; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2786; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2787; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2788; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2789; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2790; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2791; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2792; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2793; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2794; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2795; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2796; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2797; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2798; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2799; GFX1064-NEXT:    s_mov_b32 s2, -1
2800; GFX1064-NEXT:    ; implicit-def: $vgpr0
2801; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2802; GFX1064-NEXT:    s_cbranch_execz BB14_2
2803; GFX1064-NEXT:  ; %bb.1:
2804; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2805; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2806; GFX1064-NEXT:    s_mov_b32 s3, s7
2807; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2808; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2809; GFX1064-NEXT:    ds_and_rtn_b32 v0, v7, v4
2810; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2811; GFX1064-NEXT:    buffer_gl0_inv
2812; GFX1064-NEXT:    buffer_gl1_inv
2813; GFX1064-NEXT:  BB14_2:
2814; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2815; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2816; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2817; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2818; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2819; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2820; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2821; GFX1064-NEXT:    s_nop 0
2822; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2823; GFX1064-NEXT:    s_endpgm
2824;
2825; GFX1032-LABEL: and_i32_varying:
2826; GFX1032:       ; %bb.0: ; %entry
2827; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2828; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2829; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2830; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2831; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2832; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2833; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2834; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2835; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2836; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2837; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2838; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2839; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2840; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2841; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2842; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
2843; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2844; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2845; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2846; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2847; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2848; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2849; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2850; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2851; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2852; GFX1032-NEXT:    s_mov_b32 s2, -1
2853; GFX1032-NEXT:    ; implicit-def: $vgpr0
2854; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2855; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2856; GFX1032-NEXT:    s_cbranch_execz BB14_2
2857; GFX1032-NEXT:  ; %bb.1:
2858; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2859; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2860; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2861; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2862; GFX1032-NEXT:    ds_and_rtn_b32 v0, v7, v4
2863; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2864; GFX1032-NEXT:    buffer_gl0_inv
2865; GFX1032-NEXT:    buffer_gl1_inv
2866; GFX1032-NEXT:  BB14_2:
2867; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2868; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2869; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2870; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2871; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2872; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2873; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2874; GFX1032-NEXT:    s_nop 0
2875; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2876; GFX1032-NEXT:    s_endpgm
2877entry:
2878  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2879  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2880  store i32 %old, i32 addrspace(1)* %out
2881  ret void
2882}
2883
; or_i32_varying: every lane ORs its own workitem id into @local_var32 (an
; addrspace(3) global) with acq_rel ordering and stores the value returned by
; the atomic to %out.
;
; Because the operand differs per lane, the checks for the tonga, gfx900 and
; gfx1010 runs expect a DPP v_or_b32 scan with inactive lanes preset to 0, a
; single ds_or_rtn_b32 issued by the lane whose mbcnt is 0, and a final
; v_or_b32 that merges the v_readfirstlane result with the shifted scan value.
; The run without -mcpu expects only a plain ds_or_rtn_b32 on the raw operand.
define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: or_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: or_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz BB15_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:  BB15_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: or_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz BB15_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:  BB15_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: or_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz BB15_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_or_rtn_b32 v0, v7, v4
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  BB15_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_nop 0
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: or_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    ; implicit-def: $vcc_hi
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB15_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_or_rtn_b32 v0, v7, v4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  BB15_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_nop 0
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  ; The workitem id is the atomic operand, so the value is not uniform across lanes.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel atomic OR on the addrspace(3) global; %old receives the value the atomic returns.
  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing %old keeps the returning form of the atomic in use.
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
3123
; xor_i32_varying: every lane XORs its own workitem id into @local_var32 (an
; addrspace(3) global) with acq_rel ordering and stores the value returned by
; the atomic to %out.
;
; Because the operand differs per lane, the checks for the tonga, gfx900 and
; gfx1010 runs expect a DPP v_xor_b32 scan with inactive lanes preset to 0, a
; single ds_xor_rtn_b32 issued by the lane whose mbcnt is 0, and a final
; v_xor_b32 that merges the v_readfirstlane result with the shifted scan value.
; The run without -mcpu expects only a plain ds_xor_rtn_b32 on the raw operand.
define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: xor_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: xor_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz BB16_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:  BB16_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz BB16_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:  BB16_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: xor_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz BB16_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v7, v4
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  BB16_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_nop 0
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: xor_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    ; implicit-def: $vcc_hi
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB16_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v7, v4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  BB16_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_nop 0
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  ; The workitem id is the atomic operand, so the value is not uniform across lanes.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel atomic XOR on the addrspace(3) global; %old receives the value the atomic returns.
  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing %old keeps the returning form of the atomic in use.
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
3363
3364define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3365;
3366;
3367; GFX7LESS-LABEL: max_i32_varying:
3368; GFX7LESS:       ; %bb.0: ; %entry
3369; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3370; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3371; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3372; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3373; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3374; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3375; GFX7LESS-NEXT:    buffer_wbinvl1
3376; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3377; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3378; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3379; GFX7LESS-NEXT:    s_endpgm
3380;
3381; GFX8-LABEL: max_i32_varying:
3382; GFX8:       ; %bb.0: ; %entry
3383; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3384; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3385; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3386; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3387; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3388; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3389; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3390; GFX8-NEXT:    s_not_b64 exec, exec
3391; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3392; GFX8-NEXT:    s_not_b64 exec, exec
3393; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3394; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3395; GFX8-NEXT:    s_nop 1
3396; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3397; GFX8-NEXT:    s_nop 1
3398; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3399; GFX8-NEXT:    s_nop 1
3400; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3401; GFX8-NEXT:    s_nop 1
3402; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3403; GFX8-NEXT:    s_nop 1
3404; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3405; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3406; GFX8-NEXT:    s_nop 0
3407; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3408; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3409; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3410; GFX8-NEXT:    ; implicit-def: $vgpr0
3411; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3412; GFX8-NEXT:    s_cbranch_execz BB17_2
3413; GFX8-NEXT:  ; %bb.1:
3414; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3415; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3416; GFX8-NEXT:    s_mov_b32 m0, -1
3417; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3418; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3419; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3420; GFX8-NEXT:    buffer_wbinvl1_vol
3421; GFX8-NEXT:  BB17_2:
3422; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3423; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3424; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3425; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3426; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3427; GFX8-NEXT:    s_mov_b32 s2, -1
3428; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3429; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3430; GFX8-NEXT:    s_endpgm
3431;
3432; GFX9-LABEL: max_i32_varying:
3433; GFX9:       ; %bb.0: ; %entry
3434; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3435; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3436; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3437; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3438; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3439; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3440; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3441; GFX9-NEXT:    s_not_b64 exec, exec
3442; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3443; GFX9-NEXT:    s_not_b64 exec, exec
3444; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3445; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3446; GFX9-NEXT:    s_nop 1
3447; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3448; GFX9-NEXT:    s_nop 1
3449; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3450; GFX9-NEXT:    s_nop 1
3451; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3452; GFX9-NEXT:    s_nop 1
3453; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3454; GFX9-NEXT:    s_nop 1
3455; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3456; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3457; GFX9-NEXT:    s_nop 0
3458; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3459; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3460; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3461; GFX9-NEXT:    ; implicit-def: $vgpr0
3462; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3463; GFX9-NEXT:    s_cbranch_execz BB17_2
3464; GFX9-NEXT:  ; %bb.1:
3465; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3466; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3467; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3468; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3469; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3470; GFX9-NEXT:    buffer_wbinvl1_vol
3471; GFX9-NEXT:  BB17_2:
3472; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3473; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3474; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3475; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3476; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3477; GFX9-NEXT:    s_mov_b32 s2, -1
3478; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3479; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3480; GFX9-NEXT:    s_endpgm
3481;
3482; GFX1064-LABEL: max_i32_varying:
3483; GFX1064:       ; %bb.0: ; %entry
3484; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3485; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3486; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3487; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3488; GFX1064-NEXT:    s_not_b64 exec, exec
3489; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3490; GFX1064-NEXT:    s_not_b64 exec, exec
3491; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3492; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3493; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3494; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3495; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3496; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3497; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3498; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3499; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3500; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3501; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3502; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3503; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3504; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3505; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3506; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3507; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3508; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3509; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3510; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3511; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3512; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3513; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3514; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3515; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3516; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3517; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3518; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3519; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3520; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3521; GFX1064-NEXT:    s_mov_b32 s2, -1
3522; GFX1064-NEXT:    ; implicit-def: $vgpr0
3523; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3524; GFX1064-NEXT:    s_cbranch_execz BB17_2
3525; GFX1064-NEXT:  ; %bb.1:
3526; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3527; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3528; GFX1064-NEXT:    s_mov_b32 s3, s7
3529; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3530; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3531; GFX1064-NEXT:    ds_max_rtn_i32 v0, v7, v4
3532; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3533; GFX1064-NEXT:    buffer_gl0_inv
3534; GFX1064-NEXT:    buffer_gl1_inv
3535; GFX1064-NEXT:  BB17_2:
3536; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3537; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3538; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3539; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3540; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3541; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3542; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3543; GFX1064-NEXT:    s_nop 0
3544; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3545; GFX1064-NEXT:    s_endpgm
3546;
3547; GFX1032-LABEL: max_i32_varying:
3548; GFX1032:       ; %bb.0: ; %entry
3549; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3550; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3551; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3552; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3553; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3554; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3555; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3556; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3557; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3558; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3559; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3560; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3561; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3562; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3563; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3564; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3565; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3566; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3567; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3568; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3569; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3570; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3571; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3572; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3573; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3574; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3575; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3576; GFX1032-NEXT:    s_mov_b32 s2, -1
3577; GFX1032-NEXT:    ; implicit-def: $vgpr0
3578; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3579; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3580; GFX1032-NEXT:    s_cbranch_execz BB17_2
3581; GFX1032-NEXT:  ; %bb.1:
3582; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3583; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3584; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3585; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3586; GFX1032-NEXT:    ds_max_rtn_i32 v0, v7, v4
3587; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3588; GFX1032-NEXT:    buffer_gl0_inv
3589; GFX1032-NEXT:    buffer_gl1_inv
3590; GFX1032-NEXT:  BB17_2:
3591; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3592; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3593; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3594; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3595; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3596; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3597; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3598; GFX1032-NEXT:    s_nop 0
3599; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3600; GFX1032-NEXT:    s_endpgm
3601entry:
3602  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3603  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3604  store i32 %old, i32 addrspace(1)* %out
3605  ret void
3606}
3607
; max_i64_constant: signed 64-bit atomicrmw max of the constant 5 on the
; addrspace(3) global @local_var64 with acq_rel ordering. The value returned
; by the atomic is stored to %out.
;
; What the autogenerated checks below pin down for every run line:
;   * only the lane whose v_mbcnt result equals 0 issues the single
;     ds_max_rtn_i64, with the operand 5;
;   * the value it returns is broadcast with v_readfirstlane_b32;
;   * every lane then takes the signed max (v_cmp_gt_i64 + v_cndmask) of that
;     broadcast value and either 0x8000000000000000 (the lane that issued the
;     atomic) or 5 (all other lanes).
; The wave32 run only needs v_mbcnt_lo; the wave64 runs also use v_mbcnt_hi.
;
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
3608define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3609;
3610;
3611; GFX7LESS-LABEL: max_i64_constant:
3612; GFX7LESS:       ; %bb.0: ; %entry
3613; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3614; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3615; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3616; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3617; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3618; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3619; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3620; GFX7LESS-NEXT:  ; %bb.1:
3621; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3622; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3623; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3624; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3625; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3626; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3627; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3628; GFX7LESS-NEXT:    buffer_wbinvl1
3629; GFX7LESS-NEXT:  BB18_2:
3630; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3631; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3632; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3633; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3634; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3635; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3636; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3637; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3638; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3639; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3640; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3641; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3642; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3643; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3644; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3645; GFX7LESS-NEXT:    s_endpgm
3646;
3647; GFX8-LABEL: max_i64_constant:
3648; GFX8:       ; %bb.0: ; %entry
3649; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3650; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3651; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3652; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3653; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3654; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3655; GFX8-NEXT:    s_cbranch_execz BB18_2
3656; GFX8-NEXT:  ; %bb.1:
3657; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3658; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3659; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3660; GFX8-NEXT:    s_mov_b32 m0, -1
3661; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3662; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3663; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3664; GFX8-NEXT:    buffer_wbinvl1_vol
3665; GFX8-NEXT:  BB18_2:
3666; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3667; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3668; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3669; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3670; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3671; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3672; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3673; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3674; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3675; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3676; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3677; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3678; GFX8-NEXT:    s_mov_b32 s2, -1
3679; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3680; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3681; GFX8-NEXT:    s_endpgm
3682;
3683; GFX9-LABEL: max_i64_constant:
3684; GFX9:       ; %bb.0: ; %entry
3685; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3686; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3687; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3688; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3689; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3690; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3691; GFX9-NEXT:    s_cbranch_execz BB18_2
3692; GFX9-NEXT:  ; %bb.1:
3693; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3694; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3695; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3696; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3697; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3698; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3699; GFX9-NEXT:    buffer_wbinvl1_vol
3700; GFX9-NEXT:  BB18_2:
3701; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3702; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3703; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3704; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3705; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3706; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3707; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3708; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3709; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3710; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3711; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3712; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3713; GFX9-NEXT:    s_mov_b32 s2, -1
3714; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3715; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3716; GFX9-NEXT:    s_endpgm
3717;
3718; GFX1064-LABEL: max_i64_constant:
3719; GFX1064:       ; %bb.0: ; %entry
3720; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3721; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3722; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3723; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3724; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3725; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3726; GFX1064-NEXT:    s_cbranch_execz BB18_2
3727; GFX1064-NEXT:  ; %bb.1:
3728; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3729; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3730; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3731; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3732; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3733; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3734; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3735; GFX1064-NEXT:    buffer_gl0_inv
3736; GFX1064-NEXT:    buffer_gl1_inv
3737; GFX1064-NEXT:  BB18_2:
3738; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3739; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3740; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3741; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3742; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3743; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3744; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3745; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3746; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3747; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3748; GFX1064-NEXT:    s_mov_b32 s2, -1
3749; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3750; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3751; GFX1064-NEXT:    s_endpgm
3752;
3753; GFX1032-LABEL: max_i64_constant:
3754; GFX1032:       ; %bb.0: ; %entry
3755; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3756; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3757; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3758; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3759; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3760; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3761; GFX1032-NEXT:    s_cbranch_execz BB18_2
3762; GFX1032-NEXT:  ; %bb.1:
3763; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3764; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3765; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3766; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3767; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3768; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3769; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3770; GFX1032-NEXT:    buffer_gl0_inv
3771; GFX1032-NEXT:    buffer_gl1_inv
3772; GFX1032-NEXT:  BB18_2:
3773; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3774; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3775; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3776; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3777; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3778; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3779; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
3780; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3781; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3782; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3783; GFX1032-NEXT:    s_mov_b32 s2, -1
3784; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3785; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3786; GFX1032-NEXT:    s_endpgm
3787entry:
  ; The operand is the same constant (5) in every lane, so the expected code
  ; above contains no cross-lane (DPP) scan, only the single-lane atomic.
3788  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Store the atomic's result to the kernel's global-memory output pointer.
3789  store i64 %old, i64 addrspace(1)* %out
3790  ret void
3791}
3792
; min_i32_varying: signed 32-bit atomicrmw min on the addrspace(3) global
; @local_var32 with acq_rel ordering, where the operand is the lane's
; workitem id (llvm.amdgcn.workitem.id.x) and therefore differs per lane.
; The value returned by the atomic is stored to %out.
;
; What the autogenerated checks below pin down:
;   * the GFX7LESS run has no cross-lane reduction: the expected code is a
;     plain ds_min_rtn_i32 on v0 with no mbcnt/exec masking around it;
;   * the other runs first write v_bfrev_b32(-2), i.e. 0x7fffffff, into the
;     lanes selected by the inverted exec mask, then combine the lanes with a
;     chain of v_min_i32_dpp (row_shr/row_bcast on GFX8 and GFX9,
;     row_shr + v_permlanex16 + readlane/writelane on the gfx1010 runs);
;   * the wave-wide result is read from the last lane (lane 63 in the wave64
;     runs, lane 31 in the wave32 run) and used as the operand of one
;     ds_min_rtn_i32 issued by the lane whose v_mbcnt result is 0;
;   * the returned value is broadcast with v_readfirstlane_b32 and each lane
;     takes v_min_i32 of it with its shifted scan value held in v1.
;
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
3793define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3794;
3795;
3796; GFX7LESS-LABEL: min_i32_varying:
3797; GFX7LESS:       ; %bb.0: ; %entry
3798; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3799; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3800; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3801; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3802; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3803; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3804; GFX7LESS-NEXT:    buffer_wbinvl1
3805; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3806; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3807; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3808; GFX7LESS-NEXT:    s_endpgm
3809;
3810; GFX8-LABEL: min_i32_varying:
3811; GFX8:       ; %bb.0: ; %entry
3812; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3813; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3814; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3815; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3816; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3817; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3818; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3819; GFX8-NEXT:    s_not_b64 exec, exec
3820; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3821; GFX8-NEXT:    s_not_b64 exec, exec
3822; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3823; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3824; GFX8-NEXT:    s_nop 1
3825; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3826; GFX8-NEXT:    s_nop 1
3827; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3828; GFX8-NEXT:    s_nop 1
3829; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3830; GFX8-NEXT:    s_nop 1
3831; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3832; GFX8-NEXT:    s_nop 1
3833; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3834; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3835; GFX8-NEXT:    s_nop 0
3836; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3837; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3838; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3839; GFX8-NEXT:    ; implicit-def: $vgpr0
3840; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3841; GFX8-NEXT:    s_cbranch_execz BB19_2
3842; GFX8-NEXT:  ; %bb.1:
3843; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3844; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3845; GFX8-NEXT:    s_mov_b32 m0, -1
3846; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3847; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3848; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3849; GFX8-NEXT:    buffer_wbinvl1_vol
3850; GFX8-NEXT:  BB19_2:
3851; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3852; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3853; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3854; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3855; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3856; GFX8-NEXT:    s_mov_b32 s2, -1
3857; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3858; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3859; GFX8-NEXT:    s_endpgm
3860;
3861; GFX9-LABEL: min_i32_varying:
3862; GFX9:       ; %bb.0: ; %entry
3863; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3864; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3865; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3866; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3867; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3868; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3869; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3870; GFX9-NEXT:    s_not_b64 exec, exec
3871; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3872; GFX9-NEXT:    s_not_b64 exec, exec
3873; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3874; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3875; GFX9-NEXT:    s_nop 1
3876; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3877; GFX9-NEXT:    s_nop 1
3878; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3879; GFX9-NEXT:    s_nop 1
3880; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3881; GFX9-NEXT:    s_nop 1
3882; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3883; GFX9-NEXT:    s_nop 1
3884; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3885; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3886; GFX9-NEXT:    s_nop 0
3887; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3888; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3889; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3890; GFX9-NEXT:    ; implicit-def: $vgpr0
3891; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3892; GFX9-NEXT:    s_cbranch_execz BB19_2
3893; GFX9-NEXT:  ; %bb.1:
3894; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3895; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3896; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3897; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3898; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3899; GFX9-NEXT:    buffer_wbinvl1_vol
3900; GFX9-NEXT:  BB19_2:
3901; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3902; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3903; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3904; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3905; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3906; GFX9-NEXT:    s_mov_b32 s2, -1
3907; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3908; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3909; GFX9-NEXT:    s_endpgm
3910;
3911; GFX1064-LABEL: min_i32_varying:
3912; GFX1064:       ; %bb.0: ; %entry
3913; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3914; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3915; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3916; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3917; GFX1064-NEXT:    s_not_b64 exec, exec
3918; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3919; GFX1064-NEXT:    s_not_b64 exec, exec
3920; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3921; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3922; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3923; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3924; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3925; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3926; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3927; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3928; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3929; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3930; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3931; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3932; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3933; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3934; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3935; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3936; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3937; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3938; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3939; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3940; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3941; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3942; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3943; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3944; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3945; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3946; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3947; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3948; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3949; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3950; GFX1064-NEXT:    s_mov_b32 s2, -1
3951; GFX1064-NEXT:    ; implicit-def: $vgpr0
3952; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3953; GFX1064-NEXT:    s_cbranch_execz BB19_2
3954; GFX1064-NEXT:  ; %bb.1:
3955; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3956; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3957; GFX1064-NEXT:    s_mov_b32 s3, s7
3958; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3959; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3960; GFX1064-NEXT:    ds_min_rtn_i32 v0, v7, v4
3961; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3962; GFX1064-NEXT:    buffer_gl0_inv
3963; GFX1064-NEXT:    buffer_gl1_inv
3964; GFX1064-NEXT:  BB19_2:
3965; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3966; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3967; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3968; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3969; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3970; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3971; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3972; GFX1064-NEXT:    s_nop 0
3973; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3974; GFX1064-NEXT:    s_endpgm
3975;
3976; GFX1032-LABEL: min_i32_varying:
3977; GFX1032:       ; %bb.0: ; %entry
3978; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3979; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3980; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3981; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3982; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3983; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3984; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3985; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3986; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3987; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3988; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3989; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3990; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3991; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3992; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3993; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3994; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3995; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3996; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3997; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3998; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3999; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4000; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4001; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4002; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
4003; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4004; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4005; GFX1032-NEXT:    s_mov_b32 s2, -1
4006; GFX1032-NEXT:    ; implicit-def: $vgpr0
4007; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4008; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4009; GFX1032-NEXT:    s_cbranch_execz BB19_2
4010; GFX1032-NEXT:  ; %bb.1:
4011; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4012; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4013; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4014; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4015; GFX1032-NEXT:    ds_min_rtn_i32 v0, v7, v4
4016; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4017; GFX1032-NEXT:    buffer_gl0_inv
4018; GFX1032-NEXT:    buffer_gl1_inv
4019; GFX1032-NEXT:  BB19_2:
4020; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4021; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4022; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4023; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4024; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
4025; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4026; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4027; GFX1032-NEXT:    s_nop 0
4028; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4029; GFX1032-NEXT:    s_endpgm
4030entry:
  ; Per-lane (divergent) operand: the workitem id in the x dimension.
4031  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Signed min with a different value in each lane; this is the case that
  ; produces the DPP scan in the expected code above (except for GFX7LESS).
4032  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the atomic's result to the kernel's global-memory output pointer.
4033  store i32 %old, i32 addrspace(1)* %out
4034  ret void
4035}
4036
4037define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
4038;
4039;
4040; GFX7LESS-LABEL: min_i64_constant:
4041; GFX7LESS:       ; %bb.0: ; %entry
4042; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4043; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4044; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4045; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4046; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4047; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4048; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
4049; GFX7LESS-NEXT:  ; %bb.1:
4050; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4051; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4052; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4053; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4054; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4055; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4056; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4057; GFX7LESS-NEXT:    buffer_wbinvl1
4058; GFX7LESS-NEXT:  BB20_2:
4059; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4060; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4061; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4062; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
4063; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4064; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4065; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4066; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4067; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4068; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4069; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4070; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4071; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4072; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4073; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4074; GFX7LESS-NEXT:    s_endpgm
4075;
4076; GFX8-LABEL: min_i64_constant:
4077; GFX8:       ; %bb.0: ; %entry
4078; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4079; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4080; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4081; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4082; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4083; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4084; GFX8-NEXT:    s_cbranch_execz BB20_2
4085; GFX8-NEXT:  ; %bb.1:
4086; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4087; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4088; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4089; GFX8-NEXT:    s_mov_b32 m0, -1
4090; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4091; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4092; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4093; GFX8-NEXT:    buffer_wbinvl1_vol
4094; GFX8-NEXT:  BB20_2:
4095; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4096; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4097; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
4098; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4099; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4100; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4101; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4102; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4103; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4104; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4105; GFX8-NEXT:    s_mov_b32 s2, -1
4106; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4107; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4108; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4109; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4110; GFX8-NEXT:    s_endpgm
4111;
4112; GFX9-LABEL: min_i64_constant:
4113; GFX9:       ; %bb.0: ; %entry
4114; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4115; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4116; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4117; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4118; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4119; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4120; GFX9-NEXT:    s_cbranch_execz BB20_2
4121; GFX9-NEXT:  ; %bb.1:
4122; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4123; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4124; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4125; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4126; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4127; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4128; GFX9-NEXT:    buffer_wbinvl1_vol
4129; GFX9-NEXT:  BB20_2:
4130; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4131; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4132; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
4133; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4134; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4135; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4136; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4137; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4138; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4139; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4140; GFX9-NEXT:    s_mov_b32 s2, -1
4141; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4142; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4143; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4144; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4145; GFX9-NEXT:    s_endpgm
4146;
4147; GFX1064-LABEL: min_i64_constant:
4148; GFX1064:       ; %bb.0: ; %entry
4149; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4150; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4151; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4152; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4153; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4154; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4155; GFX1064-NEXT:    s_cbranch_execz BB20_2
4156; GFX1064-NEXT:  ; %bb.1:
4157; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4158; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4159; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4160; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4161; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4162; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4163; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4164; GFX1064-NEXT:    buffer_gl0_inv
4165; GFX1064-NEXT:    buffer_gl1_inv
4166; GFX1064-NEXT:  BB20_2:
4167; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4168; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4169; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4170; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4171; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
4172; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4173; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
4174; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4175; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4176; GFX1064-NEXT:    s_mov_b32 s2, -1
4177; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4178; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4179; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4180; GFX1064-NEXT:    s_endpgm
4181;
4182; GFX1032-LABEL: min_i64_constant:
4183; GFX1032:       ; %bb.0: ; %entry
4184; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4185; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4186; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4187; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4188; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4189; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4190; GFX1032-NEXT:    s_cbranch_execz BB20_2
4191; GFX1032-NEXT:  ; %bb.1:
4192; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4193; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4194; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4195; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4196; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4197; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4198; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4199; GFX1032-NEXT:    buffer_gl0_inv
4200; GFX1032-NEXT:    buffer_gl1_inv
4201; GFX1032-NEXT:  BB20_2:
4202; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4203; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4204; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4205; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4206; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
4207; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4208; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
4209; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4210; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4211; GFX1032-NEXT:    s_mov_b32 s2, -1
4212; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4213; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4214; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4215; GFX1032-NEXT:    s_endpgm
4216entry:
4217  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
4218  store i64 %old, i64 addrspace(1)* %out
4219  ret void
4220}
4221
4222define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
4223; Varying-operand case: the kernel passes its workitem id (llvm.amdgcn.workitem.id.x) to an acq_rel `atomicrmw umax` on the addrspace(3) global @local_var32 and stores the returned old value to %out.
4224; Expected code: the GFX7LESS run issues ds_max_rtn_u32 directly with no cross-lane combine; the other runs first combine lanes with v_max_u32_dpp, place a single ds_max_rtn_u32 in a block entered only where the v_mbcnt result compares equal to 0, then use v_readfirstlane and v_max_u32 to form the stored value.
4225; GFX7LESS-LABEL: umax_i32_varying:
4226; GFX7LESS:       ; %bb.0: ; %entry
4227; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4228; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4229; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4230; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4231; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
4232; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4233; GFX7LESS-NEXT:    buffer_wbinvl1
4234; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4235; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4236; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4237; GFX7LESS-NEXT:    s_endpgm
4238;
4239; GFX8-LABEL: umax_i32_varying:
4240; GFX8:       ; %bb.0: ; %entry
4241; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4242; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4243; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4244; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4245; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4246; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4247; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4248; GFX8-NEXT:    s_not_b64 exec, exec
4249; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4250; GFX8-NEXT:    s_not_b64 exec, exec
4251; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4252; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4253; GFX8-NEXT:    s_nop 1
4254; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4255; GFX8-NEXT:    s_nop 1
4256; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4257; GFX8-NEXT:    s_nop 1
4258; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4259; GFX8-NEXT:    s_nop 1
4260; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4261; GFX8-NEXT:    s_nop 1
4262; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4263; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4264; GFX8-NEXT:    s_nop 0
4265; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4266; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4267; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4268; GFX8-NEXT:    ; implicit-def: $vgpr0
4269; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4270; GFX8-NEXT:    s_cbranch_execz BB21_2
4271; GFX8-NEXT:  ; %bb.1:
4272; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4273; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4274; GFX8-NEXT:    s_mov_b32 m0, -1
4275; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4276; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
4277; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4278; GFX8-NEXT:    buffer_wbinvl1_vol
4279; GFX8-NEXT:  BB21_2:
4280; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4281; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4282; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4283; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
4284; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4285; GFX8-NEXT:    s_mov_b32 s2, -1
4286; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4287; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4288; GFX8-NEXT:    s_endpgm
4289;
4290; GFX9-LABEL: umax_i32_varying:
4291; GFX9:       ; %bb.0: ; %entry
4292; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4293; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4294; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4295; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4296; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4297; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4298; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4299; GFX9-NEXT:    s_not_b64 exec, exec
4300; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4301; GFX9-NEXT:    s_not_b64 exec, exec
4302; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4303; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4304; GFX9-NEXT:    s_nop 1
4305; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4306; GFX9-NEXT:    s_nop 1
4307; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4308; GFX9-NEXT:    s_nop 1
4309; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4310; GFX9-NEXT:    s_nop 1
4311; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4312; GFX9-NEXT:    s_nop 1
4313; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4314; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4315; GFX9-NEXT:    s_nop 0
4316; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4317; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4318; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4319; GFX9-NEXT:    ; implicit-def: $vgpr0
4320; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4321; GFX9-NEXT:    s_cbranch_execz BB21_2
4322; GFX9-NEXT:  ; %bb.1:
4323; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4324; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4325; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4326; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4327; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4328; GFX9-NEXT:    buffer_wbinvl1_vol
4329; GFX9-NEXT:  BB21_2:
4330; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4331; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4332; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4333; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4334; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4335; GFX9-NEXT:    s_mov_b32 s2, -1
4336; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4337; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4338; GFX9-NEXT:    s_endpgm
4339;
4340; GFX1064-LABEL: umax_i32_varying:
4341; GFX1064:       ; %bb.0: ; %entry
4342; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4343; GFX1064-NEXT:    s_not_b64 exec, exec
4344; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4345; GFX1064-NEXT:    s_not_b64 exec, exec
4346; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4347; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4348; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4349; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4350; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4351; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4352; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4353; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4354; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4355; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4356; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4357; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4358; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4359; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4360; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4361; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4362; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4363; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4364; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4365; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4366; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4367; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4368; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4369; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4370; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4371; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4372; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4373; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4374; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4375; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4376; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4377; GFX1064-NEXT:    s_mov_b32 s2, -1
4378; GFX1064-NEXT:    ; implicit-def: $vgpr0
4379; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4380; GFX1064-NEXT:    s_cbranch_execz BB21_2
4381; GFX1064-NEXT:  ; %bb.1:
4382; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4383; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4384; GFX1064-NEXT:    s_mov_b32 s3, s7
4385; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4386; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4387; GFX1064-NEXT:    ds_max_rtn_u32 v0, v7, v4
4388; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4389; GFX1064-NEXT:    buffer_gl0_inv
4390; GFX1064-NEXT:    buffer_gl1_inv
4391; GFX1064-NEXT:  BB21_2:
4392; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4393; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4394; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4395; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4396; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4397; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4398; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4399; GFX1064-NEXT:    s_nop 0
4400; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4401; GFX1064-NEXT:    s_endpgm
4402;
4403; GFX1032-LABEL: umax_i32_varying:
4404; GFX1032:       ; %bb.0: ; %entry
4405; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4406; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4407; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4408; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4409; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4410; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4411; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4412; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4413; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4414; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4415; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4416; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4417; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4418; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4419; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4420; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4421; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4422; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4423; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4424; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4425; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4426; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4427; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4428; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4429; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4430; GFX1032-NEXT:    s_mov_b32 s2, -1
4431; GFX1032-NEXT:    ; implicit-def: $vgpr0
4432; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4433; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4434; GFX1032-NEXT:    s_cbranch_execz BB21_2
4435; GFX1032-NEXT:  ; %bb.1:
4436; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4437; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4438; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4439; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4440; GFX1032-NEXT:    ds_max_rtn_u32 v0, v7, v4
4441; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4442; GFX1032-NEXT:    buffer_gl0_inv
4443; GFX1032-NEXT:    buffer_gl1_inv
4444; GFX1032-NEXT:  BB21_2:
4445; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4446; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4447; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4448; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4449; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4450; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4451; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4452; GFX1032-NEXT:    s_nop 0
4453; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4454; GFX1032-NEXT:    s_endpgm
4455entry:
4456  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4457  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4458  store i32 %old, i32 addrspace(1)* %out
4459  ret void
4460}
4461
4462define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4463; Constant-operand case: the kernel performs an acq_rel `atomicrmw umax` of the i64 constant 5 on the addrspace(3) global @local_var64 and stores the returned old value to %out.
4464; Expected code for every run line: ds_max_rtn_u64 sits in a block entered only where the v_mbcnt result compares equal to 0; afterwards two v_readfirstlane reads, a v_cndmask choosing between 5 and 0, and v_cmp_gt_u64 followed by v_cndmask produce the stored 64-bit value.
4465; GFX7LESS-LABEL: umax_i64_constant:
4466; GFX7LESS:       ; %bb.0: ; %entry
4467; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4468; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4469; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4470; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4471; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4472; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4473; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4474; GFX7LESS-NEXT:  ; %bb.1:
4475; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4476; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4477; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4478; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4479; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4480; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4481; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4482; GFX7LESS-NEXT:    buffer_wbinvl1
4483; GFX7LESS-NEXT:  BB22_2:
4484; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4485; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4486; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4487; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4488; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4489; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4490; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4491; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4492; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4493; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4494; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4495; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4496; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4497; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4498; GFX7LESS-NEXT:    s_endpgm
4499;
4500; GFX8-LABEL: umax_i64_constant:
4501; GFX8:       ; %bb.0: ; %entry
4502; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4503; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4504; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4505; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4506; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4507; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4508; GFX8-NEXT:    s_cbranch_execz BB22_2
4509; GFX8-NEXT:  ; %bb.1:
4510; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4511; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4512; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4513; GFX8-NEXT:    s_mov_b32 m0, -1
4514; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4515; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4516; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4517; GFX8-NEXT:    buffer_wbinvl1_vol
4518; GFX8-NEXT:  BB22_2:
4519; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4520; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4521; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4522; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4523; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4524; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4525; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4526; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4527; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4528; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4529; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4530; GFX8-NEXT:    s_mov_b32 s2, -1
4531; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4532; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4533; GFX8-NEXT:    s_endpgm
4534;
4535; GFX9-LABEL: umax_i64_constant:
4536; GFX9:       ; %bb.0: ; %entry
4537; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4538; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4539; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4540; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4541; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4542; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4543; GFX9-NEXT:    s_cbranch_execz BB22_2
4544; GFX9-NEXT:  ; %bb.1:
4545; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4546; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4547; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4548; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4549; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4550; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4551; GFX9-NEXT:    buffer_wbinvl1_vol
4552; GFX9-NEXT:  BB22_2:
4553; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4554; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4555; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4556; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4557; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4558; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4559; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4560; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4561; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4562; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4563; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4564; GFX9-NEXT:    s_mov_b32 s2, -1
4565; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4566; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4567; GFX9-NEXT:    s_endpgm
4568;
4569; GFX1064-LABEL: umax_i64_constant:
4570; GFX1064:       ; %bb.0: ; %entry
4571; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4572; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4573; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4574; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4575; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4576; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4577; GFX1064-NEXT:    s_cbranch_execz BB22_2
4578; GFX1064-NEXT:  ; %bb.1:
4579; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4580; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4581; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4582; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4583; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4584; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4585; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4586; GFX1064-NEXT:    buffer_gl0_inv
4587; GFX1064-NEXT:    buffer_gl1_inv
4588; GFX1064-NEXT:  BB22_2:
4589; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4590; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4591; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4592; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4593; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4594; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4595; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4596; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4597; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
4598; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4599; GFX1064-NEXT:    s_mov_b32 s2, -1
4600; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4601; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4602; GFX1064-NEXT:    s_endpgm
4603;
4604; GFX1032-LABEL: umax_i64_constant:
4605; GFX1032:       ; %bb.0: ; %entry
4606; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4607; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4608; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4609; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4610; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4611; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4612; GFX1032-NEXT:    s_cbranch_execz BB22_2
4613; GFX1032-NEXT:  ; %bb.1:
4614; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4615; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4616; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4617; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4618; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4619; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4620; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4621; GFX1032-NEXT:    buffer_gl0_inv
4622; GFX1032-NEXT:    buffer_gl1_inv
4623; GFX1032-NEXT:  BB22_2:
4624; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4625; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4626; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4627; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4628; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4629; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4630; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
4631; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4632; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
4633; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4634; GFX1032-NEXT:    s_mov_b32 s2, -1
4635; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4636; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4637; GFX1032-NEXT:    s_endpgm
4638entry:
4639  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
4640  store i64 %old, i64 addrspace(1)* %out
4641  ret void
4642}
4643
4644define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4645;
4646;
4647; GFX7LESS-LABEL: umin_i32_varying:
4648; GFX7LESS:       ; %bb.0: ; %entry
4649; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4650; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4651; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4652; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4653; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4654; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4655; GFX7LESS-NEXT:    buffer_wbinvl1
4656; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4657; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4658; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4659; GFX7LESS-NEXT:    s_endpgm
4660;
4661; GFX8-LABEL: umin_i32_varying:
4662; GFX8:       ; %bb.0: ; %entry
4663; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4664; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4665; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4666; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4667; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4668; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4669; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4670; GFX8-NEXT:    s_not_b64 exec, exec
4671; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4672; GFX8-NEXT:    s_not_b64 exec, exec
4673; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4674; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4675; GFX8-NEXT:    s_nop 1
4676; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4677; GFX8-NEXT:    s_nop 1
4678; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4679; GFX8-NEXT:    s_nop 1
4680; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4681; GFX8-NEXT:    s_nop 1
4682; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4683; GFX8-NEXT:    s_nop 1
4684; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4685; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4686; GFX8-NEXT:    s_nop 0
4687; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4688; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4689; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4690; GFX8-NEXT:    ; implicit-def: $vgpr0
4691; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4692; GFX8-NEXT:    s_cbranch_execz BB23_2
4693; GFX8-NEXT:  ; %bb.1:
4694; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4695; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4696; GFX8-NEXT:    s_mov_b32 m0, -1
4697; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4698; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4699; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4700; GFX8-NEXT:    buffer_wbinvl1_vol
4701; GFX8-NEXT:  BB23_2:
4702; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4703; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4704; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4705; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4706; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4707; GFX8-NEXT:    s_mov_b32 s2, -1
4708; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4709; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4710; GFX8-NEXT:    s_endpgm
4711;
4712; GFX9-LABEL: umin_i32_varying:
4713; GFX9:       ; %bb.0: ; %entry
4714; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4715; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4716; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4717; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4718; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4719; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4720; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4721; GFX9-NEXT:    s_not_b64 exec, exec
4722; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4723; GFX9-NEXT:    s_not_b64 exec, exec
4724; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4725; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4726; GFX9-NEXT:    s_nop 1
4727; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4728; GFX9-NEXT:    s_nop 1
4729; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4730; GFX9-NEXT:    s_nop 1
4731; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4732; GFX9-NEXT:    s_nop 1
4733; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4734; GFX9-NEXT:    s_nop 1
4735; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4736; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4737; GFX9-NEXT:    s_nop 0
4738; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4739; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4740; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4741; GFX9-NEXT:    ; implicit-def: $vgpr0
4742; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4743; GFX9-NEXT:    s_cbranch_execz BB23_2
4744; GFX9-NEXT:  ; %bb.1:
4745; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4746; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4747; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4748; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4749; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4750; GFX9-NEXT:    buffer_wbinvl1_vol
4751; GFX9-NEXT:  BB23_2:
4752; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4753; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4754; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4755; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4756; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4757; GFX9-NEXT:    s_mov_b32 s2, -1
4758; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4759; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4760; GFX9-NEXT:    s_endpgm
4761;
4762; GFX1064-LABEL: umin_i32_varying:
4763; GFX1064:       ; %bb.0: ; %entry
4764; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4765; GFX1064-NEXT:    s_not_b64 exec, exec
4766; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4767; GFX1064-NEXT:    s_not_b64 exec, exec
4768; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4769; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4770; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
4771; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4772; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4773; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4774; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4775; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4776; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4777; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4778; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4779; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4780; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4781; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4782; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4783; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4784; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4785; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4786; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4787; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4788; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4789; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4790; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4791; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4792; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4793; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4794; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4795; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4796; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4797; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4798; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4799; GFX1064-NEXT:    s_mov_b32 s2, -1
4800; GFX1064-NEXT:    ; implicit-def: $vgpr0
4801; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4802; GFX1064-NEXT:    s_cbranch_execz BB23_2
4803; GFX1064-NEXT:  ; %bb.1:
4804; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4805; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4806; GFX1064-NEXT:    s_mov_b32 s3, s7
4807; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4808; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4809; GFX1064-NEXT:    ds_min_rtn_u32 v0, v7, v4
4810; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4811; GFX1064-NEXT:    buffer_gl0_inv
4812; GFX1064-NEXT:    buffer_gl1_inv
4813; GFX1064-NEXT:  BB23_2:
4814; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4815; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4816; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4817; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4818; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4819; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4820; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4821; GFX1064-NEXT:    s_nop 0
4822; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4823; GFX1064-NEXT:    s_endpgm
4824;
4825; GFX1032-LABEL: umin_i32_varying:
4826; GFX1032:       ; %bb.0: ; %entry
4827; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4828; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4829; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4830; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4831; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4832; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4833; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4834; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4835; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4836; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4837; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4838; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4839; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4840; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4841; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4842; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
4843; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4844; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4845; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4846; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4847; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4848; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4849; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4850; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4851; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4852; GFX1032-NEXT:    s_mov_b32 s2, -1
4853; GFX1032-NEXT:    ; implicit-def: $vgpr0
4854; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4855; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4856; GFX1032-NEXT:    s_cbranch_execz BB23_2
4857; GFX1032-NEXT:  ; %bb.1:
4858; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4859; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4860; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4861; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4862; GFX1032-NEXT:    ds_min_rtn_u32 v0, v7, v4
4863; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4864; GFX1032-NEXT:    buffer_gl0_inv
4865; GFX1032-NEXT:    buffer_gl1_inv
4866; GFX1032-NEXT:  BB23_2:
4867; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4868; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4869; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4870; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4871; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4872; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4873; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4874; GFX1032-NEXT:    s_nop 0
4875; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4876; GFX1032-NEXT:    s_endpgm
4877entry:
4878  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4879  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4880  store i32 %old, i32 addrspace(1)* %out
4881  ret void
4882}
4883
; umin_i64_constant: the kernel performs an atomicrmw umin of the constant
; i64 5 on the local (addrspace(3)) variable @local_var64 with acq_rel
; ordering, then stores the value returned by the atomic to %out.
;
; The assertions in this function are autogenerated (see the note on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; For every checked target the expected output places the single
; ds_min_rtn_u64 in a block guarded by s_and_saveexec on the result of
; comparing the v_mbcnt_* lane count against 0, then reads the returned
; value back with v_readfirstlane_b32 and feeds it through a
; v_cmp_lt_u64 / v_cndmask_b32 sequence before the buffer store.
; NOTE(review): presumably this is the atomic optimizer routing the uniform
; operation through one lane -- confirm against the pass implementation.
4884define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4885;
4886;
4887; GFX7LESS-LABEL: umin_i64_constant:
4888; GFX7LESS:       ; %bb.0: ; %entry
4889; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4890; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4891; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4892; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4893; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4894; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4895; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4896; GFX7LESS-NEXT:  ; %bb.1:
4897; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4898; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4899; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4900; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4901; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4902; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4903; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4904; GFX7LESS-NEXT:    buffer_wbinvl1
4905; GFX7LESS-NEXT:  BB24_2:
4906; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4907; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4908; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4909; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4910; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4911; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4912; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4913; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4914; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4915; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4916; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4917; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4918; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4919; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4920; GFX7LESS-NEXT:    s_endpgm
4921;
4922; GFX8-LABEL: umin_i64_constant:
4923; GFX8:       ; %bb.0: ; %entry
4924; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4925; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4926; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4927; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4928; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4929; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4930; GFX8-NEXT:    s_cbranch_execz BB24_2
4931; GFX8-NEXT:  ; %bb.1:
4932; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4933; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4934; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4935; GFX8-NEXT:    s_mov_b32 m0, -1
4936; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4937; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4938; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4939; GFX8-NEXT:    buffer_wbinvl1_vol
4940; GFX8-NEXT:  BB24_2:
4941; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4942; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4943; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4944; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4945; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4946; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4947; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4948; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4949; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4950; GFX8-NEXT:    s_mov_b32 s2, -1
4951; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4952; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4953; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4954; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4955; GFX8-NEXT:    s_endpgm
4956;
4957; GFX9-LABEL: umin_i64_constant:
4958; GFX9:       ; %bb.0: ; %entry
4959; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4960; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4961; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4962; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4963; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4964; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4965; GFX9-NEXT:    s_cbranch_execz BB24_2
4966; GFX9-NEXT:  ; %bb.1:
4967; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4968; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4969; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4970; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4971; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4972; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4973; GFX9-NEXT:    buffer_wbinvl1_vol
4974; GFX9-NEXT:  BB24_2:
4975; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4976; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4977; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4978; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4979; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4980; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4981; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4982; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4983; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4984; GFX9-NEXT:    s_mov_b32 s2, -1
4985; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4986; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4987; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4988; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4989; GFX9-NEXT:    s_endpgm
4990;
4991; GFX1064-LABEL: umin_i64_constant:
4992; GFX1064:       ; %bb.0: ; %entry
4993; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4994; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4995; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4996; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4997; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4998; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4999; GFX1064-NEXT:    s_cbranch_execz BB24_2
5000; GFX1064-NEXT:  ; %bb.1:
5001; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5002; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5003; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5004; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5005; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5006; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5007; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5008; GFX1064-NEXT:    buffer_gl0_inv
5009; GFX1064-NEXT:    buffer_gl1_inv
5010; GFX1064-NEXT:  BB24_2:
5011; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5012; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5013; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5014; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5015; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
5016; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5017; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
5018; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5019; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5020; GFX1064-NEXT:    s_mov_b32 s2, -1
5021; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5022; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5023; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5024; GFX1064-NEXT:    s_endpgm
5025;
5026; GFX1032-LABEL: umin_i64_constant:
5027; GFX1032:       ; %bb.0: ; %entry
5028; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5029; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5030; GFX1032-NEXT:    ; implicit-def: $vcc_hi
5031; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5032; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5033; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5034; GFX1032-NEXT:    s_cbranch_execz BB24_2
5035; GFX1032-NEXT:  ; %bb.1:
5036; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5037; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5038; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5039; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5040; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5041; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5042; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5043; GFX1032-NEXT:    buffer_gl0_inv
5044; GFX1032-NEXT:    buffer_gl1_inv
5045; GFX1032-NEXT:  BB24_2:
5046; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5047; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5048; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5049; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5050; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
5051; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5052; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
5053; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5054; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5055; GFX1032-NEXT:    s_mov_b32 s2, -1
5056; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5057; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5058; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5059; GFX1032-NEXT:    s_endpgm
5060entry:
  ; The operand is the compile-time constant 5, so the value does not depend
  ; on the work-item (unlike the *_varying tests, which use the workitem id).
5061  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Store the value the atomic returned so the result stays live in the
  ; generated code.
5062  store i64 %old, i64 addrspace(1)* %out
5063  ret void
5064}
5065