1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7
8declare i32 @llvm.amdgcn.workitem.id.x()
9
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
; add_i32_constant - atomicrmw add (acq_rel) of the constant 5 to @local_var32;
; the returned old value is stored to %out.
; What the assertions below pin down, on every checked target:
;   * the lane index is derived with v_mbcnt from a saved copy of exec, and only
;     the lane whose index is 0 enters the block containing the DS atomic;
;   * that lane issues a single ds_add_rtn_u32 whose operand is
;     bcnt(saved exec) * 5 (s_bcnt1 followed by s_mul_i32);
;   * after exec is restored, the atomic result is read with
;     v_readfirstlane_b32 and each lane forms (lane index * 5) + result with
;     v_mad_u32_u24 before the buffer store.
; The wave32 run uses the 32-bit forms (exec_lo, s_bcnt1_i32_b32, no mbcnt_hi).
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16;
17;
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
31; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
32; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
33; GFX7LESS-NEXT:    s_mov_b32 m0, -1
34; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:  BB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
39; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
41; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
42; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
43; GFX7LESS-NEXT:    s_mov_b32 s2, -1
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
56; GFX8-NEXT:    s_cbranch_execz BB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    s_mul_i32 s2, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v1, 0
61; GFX8-NEXT:    v_mov_b32_e32 v2, s2
62; GFX8-NEXT:    s_mov_b32 m0, -1
63; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:  BB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
68; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
70; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
71; GFX8-NEXT:    s_mov_b32 s3, 0xf000
72; GFX8-NEXT:    s_mov_b32 s2, -1
73; GFX8-NEXT:    s_nop 1
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    s_mov_b64 s[2:3], exec
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
86; GFX9-NEXT:    s_cbranch_execz BB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    s_mul_i32 s2, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v1, 0
91; GFX9-NEXT:    v_mov_b32_e32 v2, s2
92; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:  BB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
99; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
100; GFX9-NEXT:    s_mov_b32 s3, 0xf000
101; GFX9-NEXT:    s_mov_b32 s2, -1
102; GFX9-NEXT:    s_nop 1
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz BB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
119; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
120; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
121; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
122; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
123; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
124; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX1064-NEXT:    buffer_gl0_inv
126; GFX1064-NEXT:  BB0_2:
127; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
135; GFX1064-NEXT:    s_endpgm
136;
137; GFX1032-LABEL: add_i32_constant:
138; GFX1032:       ; %bb.0: ; %entry
139; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
140; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
141; GFX1032-NEXT:    ; implicit-def: $vgpr1
142; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
143; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
144; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
145; GFX1032-NEXT:    s_cbranch_execz BB0_2
146; GFX1032-NEXT:  ; %bb.1:
147; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
148; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
149; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
150; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
151; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
152; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
153; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
154; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX1032-NEXT:    buffer_gl0_inv
156; GFX1032-NEXT:  BB0_2:
157; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
158; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
159; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
160; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
161; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
162; GFX1032-NEXT:    s_mov_b32 s2, -1
163; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
165; GFX1032-NEXT:    s_endpgm
166entry:
167  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
168  store i32 %old, i32 addrspace(1)* %out
169  ret void
170}
171
; add_i32_uniform - atomicrmw add (acq_rel) of the kernel argument %additive to
; @local_var32; the returned old value is stored to %out.
; What the assertions below pin down, on every checked target:
;   * %additive is loaded with s_load_dword into a scalar register;
;   * only the lane whose v_mbcnt index is 0 issues the single ds_add_rtn_u32,
;     with operand %additive * bcnt(saved exec) computed by s_mul_i32;
;   * after exec is restored, each lane computes %additive * lane index with
;     v_mul_lo_u32 and adds the v_readfirstlane_b32 of the atomic result before
;     the buffer store.
172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
173;
174;
175; GFX7LESS-LABEL: add_i32_uniform:
176; GFX7LESS:       ; %bb.0: ; %entry
177; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
178; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
179; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
180; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
181; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
182; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
183; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
184; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
185; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
186; GFX7LESS-NEXT:  ; %bb.1:
187; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
188; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
190; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
191; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
192; GFX7LESS-NEXT:    s_mov_b32 m0, -1
193; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
195; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX7LESS-NEXT:  BB1_2:
197; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
198; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
200; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
201; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
202; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
203; GFX7LESS-NEXT:    s_mov_b32 s6, -1
204; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
205; GFX7LESS-NEXT:    s_endpgm
206;
207; GFX8-LABEL: add_i32_uniform:
208; GFX8:       ; %bb.0: ; %entry
209; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
210; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
211; GFX8-NEXT:    s_mov_b64 s[2:3], exec
212; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
213; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
214; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
215; GFX8-NEXT:    ; implicit-def: $vgpr1
216; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
217; GFX8-NEXT:    s_cbranch_execz BB1_2
218; GFX8-NEXT:  ; %bb.1:
219; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
220; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX8-NEXT:    s_mul_i32 s1, s0, s1
222; GFX8-NEXT:    v_mov_b32_e32 v1, 0
223; GFX8-NEXT:    v_mov_b32_e32 v2, s1
224; GFX8-NEXT:    s_mov_b32 m0, -1
225; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
226; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
227; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX8-NEXT:  BB1_2:
229; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
230; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
232; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
233; GFX8-NEXT:    s_mov_b32 s7, 0xf000
234; GFX8-NEXT:    s_mov_b32 s6, -1
235; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
236; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
237; GFX8-NEXT:    s_endpgm
238;
239; GFX9-LABEL: add_i32_uniform:
240; GFX9:       ; %bb.0: ; %entry
241; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
242; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
243; GFX9-NEXT:    s_mov_b64 s[6:7], exec
244; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
245; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
246; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
247; GFX9-NEXT:    ; implicit-def: $vgpr1
248; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
249; GFX9-NEXT:    s_cbranch_execz BB1_2
250; GFX9-NEXT:  ; %bb.1:
251; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
252; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX9-NEXT:    s_mul_i32 s3, s2, s3
254; GFX9-NEXT:    v_mov_b32_e32 v1, 0
255; GFX9-NEXT:    v_mov_b32_e32 v2, s3
256; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:  BB1_2:
260; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
261; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
263; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
264; GFX9-NEXT:    s_mov_b32 s7, 0xf000
265; GFX9-NEXT:    s_mov_b32 s6, -1
266; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
267; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
268; GFX9-NEXT:    s_endpgm
269;
270; GFX1064-LABEL: add_i32_uniform:
271; GFX1064:       ; %bb.0: ; %entry
272; GFX1064-NEXT:    s_clause 0x1
273; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
274; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
275; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
276; GFX1064-NEXT:    ; implicit-def: $vgpr1
277; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
278; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
279; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
280; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
281; GFX1064-NEXT:    s_cbranch_execz BB1_2
282; GFX1064-NEXT:  ; %bb.1:
283; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
284; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
285; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
287; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
288; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
289; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
291; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX1064-NEXT:    buffer_gl0_inv
293; GFX1064-NEXT:  BB1_2:
294; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
295; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
296; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
298; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
299; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
300; GFX1064-NEXT:    s_mov_b32 s6, -1
301; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
302; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
303; GFX1064-NEXT:    s_endpgm
304;
305; GFX1032-LABEL: add_i32_uniform:
306; GFX1032:       ; %bb.0: ; %entry
307; GFX1032-NEXT:    s_clause 0x1
308; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
309; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
310; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
311; GFX1032-NEXT:    ; implicit-def: $vgpr1
312; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
313; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
314; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
315; GFX1032-NEXT:    s_cbranch_execz BB1_2
316; GFX1032-NEXT:  ; %bb.1:
317; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
318; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
319; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
321; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
322; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
323; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
324; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
325; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX1032-NEXT:    buffer_gl0_inv
327; GFX1032-NEXT:  BB1_2:
328; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
329; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
330; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
332; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
333; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
334; GFX1032-NEXT:    s_mov_b32 s6, -1
335; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
336; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
337; GFX1032-NEXT:    s_endpgm
338entry:
339  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
340  store i32 %old, i32 addrspace(1)* %out
341  ret void
342}
343
; add_i32_varying - atomicrmw add (acq_rel) to @local_var32 where the addend is
; llvm.amdgcn.workitem.id.x, i.e. a value that differs per lane; the returned
; old value is stored to %out.
; What the assertions below pin down:
;   * the GFX7LESS run expects a plain ds_add_rtn_u32 executed by every lane,
;     with no mbcnt test and no single-lane branch;
;   * the GFX8 and GFX9 runs expect inactive lanes to be zeroed (s_not_b64 exec
;     around v_mov 0), a chain of v_add_u32_dpp steps (row_shr 1/2/4/8, then
;     row_bcast 15 and 31), the total read from lane 63, a wave_shr:1 DPP move
;     producing the per-lane offset, and one ds_add_rtn_u32 issued by the lane
;     whose mbcnt index is 0; each lane then adds its offset to the
;     v_readfirstlane_b32 of the atomic result;
;   * the GFX10 runs expect the same overall shape but built from row_shr DPP
;     adds, v_permlanex16_b32 and v_readlane/v_writelane fix-ups; the wave64
;     variant takes the total from lane 63 and the wave32 variant from lane 31.
344define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
345;
346;
347; GFX7LESS-LABEL: add_i32_varying:
348; GFX7LESS:       ; %bb.0: ; %entry
349; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
350; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
351; GFX7LESS-NEXT:    s_mov_b32 m0, -1
352; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
354; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
356; GFX7LESS-NEXT:    s_mov_b32 s2, -1
357; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
358; GFX7LESS-NEXT:    s_endpgm
359;
360; GFX8-LABEL: add_i32_varying:
361; GFX8:       ; %bb.0: ; %entry
362; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
363; GFX8-NEXT:    v_mov_b32_e32 v2, v0
364; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
365; GFX8-NEXT:    v_mov_b32_e32 v1, 0
366; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
367; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
368; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
369; GFX8-NEXT:    s_not_b64 exec, exec
370; GFX8-NEXT:    v_mov_b32_e32 v2, 0
371; GFX8-NEXT:    s_not_b64 exec, exec
372; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
373; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
374; GFX8-NEXT:    s_nop 1
375; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
376; GFX8-NEXT:    s_nop 1
377; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
378; GFX8-NEXT:    s_nop 1
379; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
380; GFX8-NEXT:    s_nop 1
381; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
382; GFX8-NEXT:    s_nop 1
383; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
384; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
385; GFX8-NEXT:    s_nop 0
386; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
387; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
388; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
389; GFX8-NEXT:    ; implicit-def: $vgpr0
390; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
391; GFX8-NEXT:    s_cbranch_execz BB2_2
392; GFX8-NEXT:  ; %bb.1:
393; GFX8-NEXT:    v_mov_b32_e32 v0, 0
394; GFX8-NEXT:    v_mov_b32_e32 v3, s4
395; GFX8-NEXT:    s_mov_b32 m0, -1
396; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
398; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX8-NEXT:  BB2_2:
400; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
401; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
403; GFX8-NEXT:    v_mov_b32_e32 v0, v1
404; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
405; GFX8-NEXT:    s_mov_b32 s3, 0xf000
406; GFX8-NEXT:    s_mov_b32 s2, -1
407; GFX8-NEXT:    s_nop 0
408; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
409; GFX8-NEXT:    s_endpgm
410;
411; GFX9-LABEL: add_i32_varying:
412; GFX9:       ; %bb.0: ; %entry
413; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
414; GFX9-NEXT:    v_mov_b32_e32 v2, v0
415; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
416; GFX9-NEXT:    v_mov_b32_e32 v1, 0
417; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
418; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
419; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
420; GFX9-NEXT:    s_not_b64 exec, exec
421; GFX9-NEXT:    v_mov_b32_e32 v2, 0
422; GFX9-NEXT:    s_not_b64 exec, exec
423; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
424; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
425; GFX9-NEXT:    s_nop 1
426; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
427; GFX9-NEXT:    s_nop 1
428; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
429; GFX9-NEXT:    s_nop 1
430; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
431; GFX9-NEXT:    s_nop 1
432; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
433; GFX9-NEXT:    s_nop 1
434; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
435; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
436; GFX9-NEXT:    s_nop 0
437; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
438; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
439; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
440; GFX9-NEXT:    ; implicit-def: $vgpr0
441; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
442; GFX9-NEXT:    s_cbranch_execz BB2_2
443; GFX9-NEXT:  ; %bb.1:
444; GFX9-NEXT:    v_mov_b32_e32 v0, 0
445; GFX9-NEXT:    v_mov_b32_e32 v3, s4
446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
448; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX9-NEXT:  BB2_2:
450; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
451; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
453; GFX9-NEXT:    v_mov_b32_e32 v0, v1
454; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
455; GFX9-NEXT:    s_mov_b32 s3, 0xf000
456; GFX9-NEXT:    s_mov_b32 s2, -1
457; GFX9-NEXT:    s_nop 0
458; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
459; GFX9-NEXT:    s_endpgm
460;
461; GFX1064-LABEL: add_i32_varying:
462; GFX1064:       ; %bb.0: ; %entry
463; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
464; GFX1064-NEXT:    s_not_b64 exec, exec
465; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
466; GFX1064-NEXT:    s_not_b64 exec, exec
467; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
468; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
469; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
470; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
471; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
472; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
473; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
474; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
475; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
476; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
477; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
478; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
479; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
480; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
481; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
482; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
483; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
484; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
485; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
486; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
487; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
488; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
489; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
490; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
491; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
492; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
493; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
494; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
495; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
496; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
497; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
498; GFX1064-NEXT:    s_mov_b32 s2, -1
499; GFX1064-NEXT:    ; implicit-def: $vgpr0
500; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
501; GFX1064-NEXT:    s_cbranch_execz BB2_2
502; GFX1064-NEXT:  ; %bb.1:
503; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
504; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
505; GFX1064-NEXT:    s_mov_b32 s3, s7
506; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
507; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
508; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
509; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
510; GFX1064-NEXT:    buffer_gl0_inv
511; GFX1064-NEXT:  BB2_2:
512; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
513; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
514; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
515; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
516; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
517; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
518; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
519; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
520; GFX1064-NEXT:    s_endpgm
521;
522; GFX1032-LABEL: add_i32_varying:
523; GFX1032:       ; %bb.0: ; %entry
524; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
525; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
526; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
527; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
528; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
529; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
530; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
531; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
532; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
533; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
534; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
535; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
536; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
537; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
538; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
539; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
540; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
541; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
542; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
543; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
544; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
545; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
546; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
547; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
548; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
549; GFX1032-NEXT:    s_mov_b32 s2, -1
550; GFX1032-NEXT:    ; implicit-def: $vgpr0
551; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
552; GFX1032-NEXT:    s_cbranch_execz BB2_2
553; GFX1032-NEXT:  ; %bb.1:
554; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
555; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
556; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
557; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
558; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
559; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX1032-NEXT:    buffer_gl0_inv
561; GFX1032-NEXT:  BB2_2:
562; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
563; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
564; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
565; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
566; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
567; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
568; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
570; GFX1032-NEXT:    s_endpgm
571entry:
572  %lane = call i32 @llvm.amdgcn.workitem.id.x()
573  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
574  store i32 %old, i32 addrspace(1)* %out
575  ret void
576}
577
; add_i32_varying_nouse - atomicrmw add (acq_rel) to @local_var32 with the
; per-lane addend llvm.amdgcn.workitem.id.x, where the result %old is never
; used and the kernel stores nothing.
; What the assertions below pin down:
;   * every run expects the non-returning ds_add_u32 form and no buffer store;
;   * the GFX7LESS run expects the DS atomic to be executed by every lane with
;     no mbcnt test or branch;
;   * the GFX8 and GFX9 runs expect the DPP add chain (row_shr 1/2/4/8, then
;     row_bcast 15 and 31), the total read from lane 63, and the atomic issued
;     only by the lane whose mbcnt index is 0; no wave_shr move is expected;
;   * the GFX10 runs expect row_xmask DPP adds plus v_permlanex16_b32; the
;     wave64 variant additionally sums lanes 0 and 32 with s_add_i32.
578define amdgpu_kernel void @add_i32_varying_nouse() {
579; GFX7LESS-LABEL: add_i32_varying_nouse:
580; GFX7LESS:       ; %bb.0: ; %entry
581; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
582; GFX7LESS-NEXT:    s_mov_b32 m0, -1
583; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX7LESS-NEXT:    ds_add_u32 v1, v0
585; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX7LESS-NEXT:    s_endpgm
587;
588; GFX8-LABEL: add_i32_varying_nouse:
589; GFX8:       ; %bb.0: ; %entry
590; GFX8-NEXT:    v_mov_b32_e32 v1, v0
591; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
592; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
593; GFX8-NEXT:    s_not_b64 exec, exec
594; GFX8-NEXT:    v_mov_b32_e32 v1, 0
595; GFX8-NEXT:    s_not_b64 exec, exec
596; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
597; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
598; GFX8-NEXT:    s_nop 1
599; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
600; GFX8-NEXT:    s_nop 1
601; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
602; GFX8-NEXT:    s_nop 1
603; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
604; GFX8-NEXT:    s_nop 1
605; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
606; GFX8-NEXT:    s_nop 1
607; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
608; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
609; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
610; GFX8-NEXT:    s_mov_b32 s0, s2
611; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
612; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
613; GFX8-NEXT:    s_cbranch_execz BB3_2
614; GFX8-NEXT:  ; %bb.1:
615; GFX8-NEXT:    v_mov_b32_e32 v0, 0
616; GFX8-NEXT:    v_mov_b32_e32 v2, s0
617; GFX8-NEXT:    s_mov_b32 m0, -1
618; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX8-NEXT:    ds_add_u32 v0, v2
620; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX8-NEXT:  BB3_2:
622; GFX8-NEXT:    s_endpgm
623;
624; GFX9-LABEL: add_i32_varying_nouse:
625; GFX9:       ; %bb.0: ; %entry
626; GFX9-NEXT:    v_mov_b32_e32 v1, v0
627; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
628; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
629; GFX9-NEXT:    s_not_b64 exec, exec
630; GFX9-NEXT:    v_mov_b32_e32 v1, 0
631; GFX9-NEXT:    s_not_b64 exec, exec
632; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
633; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
634; GFX9-NEXT:    s_nop 1
635; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
636; GFX9-NEXT:    s_nop 1
637; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
638; GFX9-NEXT:    s_nop 1
639; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
640; GFX9-NEXT:    s_nop 1
641; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
642; GFX9-NEXT:    s_nop 1
643; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
644; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
645; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
646; GFX9-NEXT:    s_mov_b32 s0, s2
647; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
648; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
649; GFX9-NEXT:    s_cbranch_execz BB3_2
650; GFX9-NEXT:  ; %bb.1:
651; GFX9-NEXT:    v_mov_b32_e32 v0, 0
652; GFX9-NEXT:    v_mov_b32_e32 v2, s0
653; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX9-NEXT:    ds_add_u32 v0, v2
655; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX9-NEXT:  BB3_2:
657; GFX9-NEXT:    s_endpgm
658;
659; GFX1064-LABEL: add_i32_varying_nouse:
660; GFX1064:       ; %bb.0: ; %entry
661; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
662; GFX1064-NEXT:    s_not_b64 exec, exec
663; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
664; GFX1064-NEXT:    s_not_b64 exec, exec
665; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
666; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
667; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
668; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
669; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
670; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
671; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
672; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
673; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
674; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
675; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
676; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
677; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
678; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
679; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
680; GFX1064-NEXT:    s_add_i32 s0, s2, s3
681; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
682; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
683; GFX1064-NEXT:    s_cbranch_execz BB3_2
684; GFX1064-NEXT:  ; %bb.1:
685; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
686; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
687; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
688; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
689; GFX1064-NEXT:    ds_add_u32 v0, v3
690; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
691; GFX1064-NEXT:    buffer_gl0_inv
692; GFX1064-NEXT:  BB3_2:
693; GFX1064-NEXT:    s_endpgm
694;
695; GFX1032-LABEL: add_i32_varying_nouse:
696; GFX1032:       ; %bb.0: ; %entry
697; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
698; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
699; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
700; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
701; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
702; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
703; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
704; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
705; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
706; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
707; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
708; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
709; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
710; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
711; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
712; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
713; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
714; GFX1032-NEXT:    s_cbranch_execz BB3_2
715; GFX1032-NEXT:  ; %bb.1:
716; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
717; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
718; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
719; GFX1032-NEXT:    ds_add_u32 v3, v0
720; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
721; GFX1032-NEXT:    buffer_gl0_inv
722; GFX1032-NEXT:  BB3_2:
723; GFX1032-NEXT:    s_endpgm
724entry:
725  %lane = call i32 @llvm.amdgcn.workitem.id.x()
726  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
727  ret void
728}
729
; add_i64_constant - atomicrmw add of the constant 5 to the i64 LDS global
; @local_var64 with acq_rel ordering; the returned old value is stored to %out.
; Per the assertions below, every target guards a single ds_add_rtn_u64 behind
; a "v_mbcnt result == 0" test, adds s_bcnt1(exec) * 5 once, and then rebuilds
; each lane's result from the v_readfirstlane values plus 5 * v_mbcnt.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
730define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
731;
732;
733; GFX7LESS-LABEL: add_i64_constant:
734; GFX7LESS:       ; %bb.0: ; %entry
735; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
736; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
737; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
738; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
739; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
740; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
741; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
742; GFX7LESS-NEXT:    s_cbranch_execz BB4_2
743; GFX7LESS-NEXT:  ; %bb.1:
744; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
745; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
746; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
747; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
748; GFX7LESS-NEXT:    s_mov_b32 m0, -1
749; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
750; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
751; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
752; GFX7LESS-NEXT:  BB4_2:
753; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
754; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
756; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
757; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
758; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
759; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
760; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
761; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
762; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
763; GFX7LESS-NEXT:    s_mov_b32 s2, -1
764; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
765; GFX7LESS-NEXT:    s_endpgm
766;
767; GFX8-LABEL: add_i64_constant:
768; GFX8:       ; %bb.0: ; %entry
769; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
770; GFX8-NEXT:    s_mov_b64 s[4:5], exec
771; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
772; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
773; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
774; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
775; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
776; GFX8-NEXT:    s_cbranch_execz BB4_2
777; GFX8-NEXT:  ; %bb.1:
778; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
779; GFX8-NEXT:    s_mul_i32 s4, s4, 5
780; GFX8-NEXT:    v_mov_b32_e32 v1, s4
781; GFX8-NEXT:    v_mov_b32_e32 v2, 0
782; GFX8-NEXT:    s_mov_b32 m0, -1
783; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
785; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX8-NEXT:  BB4_2:
787; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
788; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
789; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
790; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
791; GFX8-NEXT:    v_mov_b32_e32 v1, s2
792; GFX8-NEXT:    v_mov_b32_e32 v2, s3
793; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
794; GFX8-NEXT:    s_mov_b32 s3, 0xf000
795; GFX8-NEXT:    s_mov_b32 s2, -1
796; GFX8-NEXT:    s_nop 2
797; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
798; GFX8-NEXT:    s_endpgm
799;
800; GFX9-LABEL: add_i64_constant:
801; GFX9:       ; %bb.0: ; %entry
802; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
803; GFX9-NEXT:    s_mov_b64 s[4:5], exec
804; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
805; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
806; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
807; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
808; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
809; GFX9-NEXT:    s_cbranch_execz BB4_2
810; GFX9-NEXT:  ; %bb.1:
811; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
812; GFX9-NEXT:    s_mul_i32 s4, s4, 5
813; GFX9-NEXT:    v_mov_b32_e32 v1, s4
814; GFX9-NEXT:    v_mov_b32_e32 v2, 0
815; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
816; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
817; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
818; GFX9-NEXT:  BB4_2:
819; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
820; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
822; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
823; GFX9-NEXT:    v_mov_b32_e32 v1, s2
824; GFX9-NEXT:    v_mov_b32_e32 v2, s3
825; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
826; GFX9-NEXT:    s_mov_b32 s3, 0xf000
827; GFX9-NEXT:    s_mov_b32 s2, -1
828; GFX9-NEXT:    s_nop 2
829; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
830; GFX9-NEXT:    s_endpgm
831;
832; GFX1064-LABEL: add_i64_constant:
833; GFX1064:       ; %bb.0: ; %entry
834; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
835; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
836; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
837; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
838; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
839; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
840; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
841; GFX1064-NEXT:    s_cbranch_execz BB4_2
842; GFX1064-NEXT:  ; %bb.1:
843; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
844; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
845; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
846; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
847; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
848; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
849; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
850; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX1064-NEXT:    buffer_gl0_inv
852; GFX1064-NEXT:  BB4_2:
853; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
854; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
855; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
856; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
857; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
858; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
859; GFX1064-NEXT:    s_mov_b32 s2, -1
860; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
861; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
862; GFX1064-NEXT:    s_endpgm
863;
864; GFX1032-LABEL: add_i64_constant:
865; GFX1032:       ; %bb.0: ; %entry
866; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
867; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
868; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
869; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
870; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
871; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
872; GFX1032-NEXT:    s_cbranch_execz BB4_2
873; GFX1032-NEXT:  ; %bb.1:
874; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
875; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
876; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
877; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
878; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
879; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
880; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
881; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
882; GFX1032-NEXT:    buffer_gl0_inv
883; GFX1032-NEXT:  BB4_2:
884; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
885; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
886; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
887; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
888; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
889; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
890; GFX1032-NEXT:    s_mov_b32 s2, -1
891; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
892; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
893; GFX1032-NEXT:    s_endpgm
894entry:
; The addend is the literal 5; %old receives the value the atomic returns and
; is stored to %out so the result of the atomic is used.
895  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
896  store i64 %old, i64 addrspace(1)* %out
897  ret void
898}
899
; add_i64_uniform - atomicrmw add of the i64 kernel argument %additive to the
; LDS global @local_var64 with acq_rel ordering; the returned old value is
; stored to %out.
; Per the assertions below, every target loads both arguments with one
; s_load_dwordx4, guards a single ds_add_rtn_u64 behind a "v_mbcnt result == 0"
; test, and feeds it additive * s_bcnt1(exec) assembled from 32-bit mul_lo /
; mul_hi pieces. After the join each lane adds additive * v_mbcnt to the
; v_readfirstlane values before the buffer store.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
900define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
901;
902;
903; GFX7LESS-LABEL: add_i64_uniform:
904; GFX7LESS:       ; %bb.0: ; %entry
905; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
906; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
907; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
908; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
909; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
910; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
911; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
912; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
913; GFX7LESS-NEXT:  ; %bb.1:
914; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
915; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
916; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
917; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
918; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
919; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
920; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
921; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
922; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
923; GFX7LESS-NEXT:    s_mov_b32 m0, -1
924; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
925; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
926; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX7LESS-NEXT:  BB5_2:
928; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
929; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
930; GFX7LESS-NEXT:    s_mov_b32 s6, -1
931; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX7LESS-NEXT:    s_mov_b32 s4, s0
933; GFX7LESS-NEXT:    s_mov_b32 s5, s1
934; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
935; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
936; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
937; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
938; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
939; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
940; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
941; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
942; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
943; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
944; GFX7LESS-NEXT:    s_endpgm
945;
946; GFX8-LABEL: add_i64_uniform:
947; GFX8:       ; %bb.0: ; %entry
948; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
949; GFX8-NEXT:    s_mov_b64 s[6:7], exec
950; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
951; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
952; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
953; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
954; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
955; GFX8-NEXT:    s_cbranch_execz BB5_2
956; GFX8-NEXT:  ; %bb.1:
957; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
958; GFX8-NEXT:    v_mov_b32_e32 v1, s6
959; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
961; GFX8-NEXT:    s_mul_i32 s7, s3, s6
962; GFX8-NEXT:    s_mul_i32 s6, s2, s6
963; GFX8-NEXT:    v_mov_b32_e32 v3, 0
964; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
965; GFX8-NEXT:    v_mov_b32_e32 v1, s6
966; GFX8-NEXT:    s_mov_b32 m0, -1
967; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
968; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
969; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
970; GFX8-NEXT:  BB5_2:
971; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
972; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX8-NEXT:    s_mov_b32 s4, s0
974; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
975; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
976; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
977; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
978; GFX8-NEXT:    s_mov_b32 s5, s1
979; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
980; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
981; GFX8-NEXT:    v_mov_b32_e32 v2, s1
982; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
983; GFX8-NEXT:    s_mov_b32 s7, 0xf000
984; GFX8-NEXT:    s_mov_b32 s6, -1
985; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
986; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
987; GFX8-NEXT:    s_endpgm
988;
989; GFX9-LABEL: add_i64_uniform:
990; GFX9:       ; %bb.0: ; %entry
991; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
992; GFX9-NEXT:    s_mov_b64 s[6:7], exec
993; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
994; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
995; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
996; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
997; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
998; GFX9-NEXT:    s_cbranch_execz BB5_2
999; GFX9-NEXT:  ; %bb.1:
1000; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1001; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1002; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1003; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1004; GFX9-NEXT:    s_add_i32 s8, s8, s7
1005; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1006; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1007; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1008; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1009; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1010; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1011; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1012; GFX9-NEXT:  BB5_2:
1013; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1014; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1015; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1016; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1017; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1018; GFX9-NEXT:    s_mov_b32 s4, s0
1019; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1020; GFX9-NEXT:    s_mov_b32 s5, s1
1021; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1022; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1023; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1024; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1025; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1026; GFX9-NEXT:    s_mov_b32 s6, -1
1027; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1028; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1029; GFX9-NEXT:    s_endpgm
1030;
1031; GFX1064-LABEL: add_i64_uniform:
1032; GFX1064:       ; %bb.0: ; %entry
1033; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1034; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1035; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1036; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1037; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1038; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1039; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1040; GFX1064-NEXT:    s_cbranch_execz BB5_2
1041; GFX1064-NEXT:  ; %bb.1:
1042; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1043; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1044; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1046; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1047; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1048; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1049; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1050; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1051; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1052; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1053; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1054; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1055; GFX1064-NEXT:    buffer_gl0_inv
1056; GFX1064-NEXT:  BB5_2:
1057; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1058; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1059; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1060; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1061; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1062; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1063; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1064; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
1065; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1066; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1067; GFX1064-NEXT:    v_add_co_u32 v0, vcc, s2, v0
1068; GFX1064-NEXT:    s_mov_b32 s2, -1
1069; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
1070; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1071; GFX1064-NEXT:    s_endpgm
1072;
1073; GFX1032-LABEL: add_i64_uniform:
1074; GFX1032:       ; %bb.0: ; %entry
1075; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1076; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1077; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1078; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1079; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1080; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1081; GFX1032-NEXT:    s_cbranch_execz BB5_2
1082; GFX1032-NEXT:  ; %bb.1:
1083; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1084; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1085; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1087; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1088; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1089; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1090; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1091; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1092; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1093; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1094; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1095; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1096; GFX1032-NEXT:    buffer_gl0_inv
1097; GFX1032-NEXT:  BB5_2:
1098; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1099; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1100; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1101; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1102; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1103; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1104; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1105; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
1106; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1107; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1108; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v0
1109; GFX1032-NEXT:    s_mov_b32 s2, -1
1110; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
1111; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1112; GFX1032-NEXT:    s_endpgm
1113entry:
; The addend is the kernel argument %additive; %old receives the value the
; atomic returns and is stored to %out so the result of the atomic is used.
1114  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1115  store i64 %old, i64 addrspace(1)* %out
1116  ret void
1117}
1118
; add_i64_varying - atomicrmw add to the i64 LDS global @local_var64 (acq_rel)
; where the addend is the workitem id (llvm.amdgcn.workitem.id.x) zero-extended
; to i64; the returned old value is stored to %out.
; The assertions below show a plain ds_add_rtn_u64 on every target, with no
; v_mbcnt / s_and_saveexec guard and no cross-lane reduction, i.e. the
; generated code issues the atomic directly for this i64 per-lane addend.
; Both gfx1010 wavefront-size configurations produce identical output here, so
; they are matched through the shared GFX10 prefix.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1119define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1120;
1121;
1122; GFX7LESS-LABEL: add_i64_varying:
1123; GFX7LESS:       ; %bb.0: ; %entry
1124; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1125; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1126; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1127; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1128; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1129; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1131; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1132; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1133; GFX7LESS-NEXT:    s_endpgm
1134;
1135; GFX8-LABEL: add_i64_varying:
1136; GFX8:       ; %bb.0: ; %entry
1137; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1138; GFX8-NEXT:    s_mov_b32 m0, -1
1139; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1140; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1141; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1142; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1143; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1144; GFX8-NEXT:    s_mov_b32 s2, -1
1145; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1146; GFX8-NEXT:    s_endpgm
1147;
1148; GFX9-LABEL: add_i64_varying:
1149; GFX9:       ; %bb.0: ; %entry
1150; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1151; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1152; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1153; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1154; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1155; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1156; GFX9-NEXT:    s_mov_b32 s2, -1
1157; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1158; GFX9-NEXT:    s_endpgm
1159;
1160; GFX10-LABEL: add_i64_varying:
1161; GFX10:       ; %bb.0: ; %entry
1162; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1163; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1164; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1165; GFX10-NEXT:    s_mov_b32 s2, -1
1166; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1167; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1168; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1169; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX10-NEXT:    buffer_gl0_inv
1171; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1172; GFX10-NEXT:    s_endpgm
1173entry:
; %lane is the workitem id; it is widened to i64 and used as the addend.
; %old receives the value the atomic returns and is stored to %out.
1174  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1175  %zext = zext i32 %lane to i64
1176  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1177  store i64 %old, i64 addrspace(1)* %out
1178  ret void
1179}
1180
; sub_i32_constant - atomicrmw sub of the constant 5 from the i32 LDS global
; @local_var32 with acq_rel ordering; the returned old value is stored to %out.
; Per the assertions below, every target guards a single ds_sub_rtn_u32 behind
; a "v_mbcnt result == 0" test, subtracts s_bcnt1(exec) * 5 once, and then
; rebuilds each lane's result as the v_readfirstlane value minus 5 * v_mbcnt.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1181define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1182;
1183;
1184; GFX7LESS-LABEL: sub_i32_constant:
1185; GFX7LESS:       ; %bb.0: ; %entry
1186; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1187; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1188; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1189; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1190; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1191; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1192; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1193; GFX7LESS-NEXT:    s_cbranch_execz BB7_2
1194; GFX7LESS-NEXT:  ; %bb.1:
1195; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1196; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1197; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1198; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1199; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1200; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1201; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1202; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX7LESS-NEXT:  BB7_2:
1204; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1205; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1206; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1207; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1208; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1209; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1210; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1211; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1212; GFX7LESS-NEXT:    s_endpgm
1213;
1214; GFX8-LABEL: sub_i32_constant:
1215; GFX8:       ; %bb.0: ; %entry
1216; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1217; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1218; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1219; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1220; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1221; GFX8-NEXT:    ; implicit-def: $vgpr1
1222; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1223; GFX8-NEXT:    s_cbranch_execz BB7_2
1224; GFX8-NEXT:  ; %bb.1:
1225; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1226; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1227; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1228; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1229; GFX8-NEXT:    s_mov_b32 m0, -1
1230; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1231; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1232; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1233; GFX8-NEXT:  BB7_2:
1234; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1235; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1237; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1238; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1239; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1240; GFX8-NEXT:    s_mov_b32 s2, -1
1241; GFX8-NEXT:    s_nop 0
1242; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1243; GFX8-NEXT:    s_endpgm
1244;
1245; GFX9-LABEL: sub_i32_constant:
1246; GFX9:       ; %bb.0: ; %entry
1247; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1248; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1249; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1250; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1252; GFX9-NEXT:    ; implicit-def: $vgpr1
1253; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1254; GFX9-NEXT:    s_cbranch_execz BB7_2
1255; GFX9-NEXT:  ; %bb.1:
1256; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1257; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1258; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1259; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1260; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1262; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1263; GFX9-NEXT:  BB7_2:
1264; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1265; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1266; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1267; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1268; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1269; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1270; GFX9-NEXT:    s_mov_b32 s2, -1
1271; GFX9-NEXT:    s_nop 0
1272; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1273; GFX9-NEXT:    s_endpgm
1274;
1275; GFX1064-LABEL: sub_i32_constant:
1276; GFX1064:       ; %bb.0: ; %entry
1277; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1278; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1279; GFX1064-NEXT:    ; implicit-def: $vgpr1
1280; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1281; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1282; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1283; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1284; GFX1064-NEXT:    s_cbranch_execz BB7_2
1285; GFX1064-NEXT:  ; %bb.1:
1286; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1287; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1288; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1289; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1290; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1291; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1292; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1293; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX1064-NEXT:    buffer_gl0_inv
1295; GFX1064-NEXT:  BB7_2:
1296; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1297; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1298; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1299; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1300; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1301; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1302; GFX1064-NEXT:    s_mov_b32 s2, -1
1303; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1305; GFX1064-NEXT:    s_endpgm
1306;
1307; GFX1032-LABEL: sub_i32_constant:
1308; GFX1032:       ; %bb.0: ; %entry
1309; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1310; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1311; GFX1032-NEXT:    ; implicit-def: $vgpr1
1312; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1313; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1314; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1315; GFX1032-NEXT:    s_cbranch_execz BB7_2
1316; GFX1032-NEXT:  ; %bb.1:
1317; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1318; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1319; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1320; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1321; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1322; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1323; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1324; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1325; GFX1032-NEXT:    buffer_gl0_inv
1326; GFX1032-NEXT:  BB7_2:
1327; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1328; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1329; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1330; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1331; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1332; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1333; GFX1032-NEXT:    s_mov_b32 s2, -1
1334; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1335; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1336; GFX1032-NEXT:    s_endpgm
1337entry:
; The subtrahend is the literal 5; %old receives the value the atomic returns
; and is stored to %out so the result of the atomic is used.
1338  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1339  store i32 %old, i32 addrspace(1)* %out
1340  ret void
1341}
1342
1343define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1344;
1345;
1346; GFX7LESS-LABEL: sub_i32_uniform:
1347; GFX7LESS:       ; %bb.0: ; %entry
1348; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1349; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1350; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
1351; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1352; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1353; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1354; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1355; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1356; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1357; GFX7LESS-NEXT:  ; %bb.1:
1358; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1359; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1360; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
1361; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1362; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1363; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1364; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1365; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1366; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX7LESS-NEXT:  BB8_2:
1368; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
1369; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1370; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1371; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
1372; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1373; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
1374; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1375; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1376; GFX7LESS-NEXT:    s_endpgm
1377;
1378; GFX8-LABEL: sub_i32_uniform:
1379; GFX8:       ; %bb.0: ; %entry
1380; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1381; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1382; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1383; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1384; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1385; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1386; GFX8-NEXT:    ; implicit-def: $vgpr1
1387; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1388; GFX8-NEXT:    s_cbranch_execz BB8_2
1389; GFX8-NEXT:  ; %bb.1:
1390; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1391; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1392; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1393; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1394; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1395; GFX8-NEXT:    s_mov_b32 m0, -1
1396; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1398; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1399; GFX8-NEXT:  BB8_2:
1400; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1401; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1402; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1403; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1404; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1405; GFX8-NEXT:    s_mov_b32 s6, -1
1406; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1407; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1408; GFX8-NEXT:    s_endpgm
1409;
1410; GFX9-LABEL: sub_i32_uniform:
1411; GFX9:       ; %bb.0: ; %entry
1412; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1413; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
1414; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1415; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1416; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1417; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1418; GFX9-NEXT:    ; implicit-def: $vgpr1
1419; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1420; GFX9-NEXT:    s_cbranch_execz BB8_2
1421; GFX9-NEXT:  ; %bb.1:
1422; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1423; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1424; GFX9-NEXT:    s_mul_i32 s3, s2, s3
1425; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1426; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1427; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1428; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1429; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1430; GFX9-NEXT:  BB8_2:
1431; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1432; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1434; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1435; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1436; GFX9-NEXT:    s_mov_b32 s6, -1
1437; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1438; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1439; GFX9-NEXT:    s_endpgm
1440;
1441; GFX1064-LABEL: sub_i32_uniform:
1442; GFX1064:       ; %bb.0: ; %entry
1443; GFX1064-NEXT:    s_clause 0x1
1444; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1445; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
1446; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1447; GFX1064-NEXT:    ; implicit-def: $vgpr1
1448; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1449; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1450; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1451; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1452; GFX1064-NEXT:    s_cbranch_execz BB8_2
1453; GFX1064-NEXT:  ; %bb.1:
1454; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1455; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1456; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1457; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
1458; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
1459; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1460; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1461; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1462; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1463; GFX1064-NEXT:    buffer_gl0_inv
1464; GFX1064-NEXT:  BB8_2:
1465; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1466; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1467; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1468; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1469; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1470; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1471; GFX1064-NEXT:    s_mov_b32 s6, -1
1472; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1473; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1474; GFX1064-NEXT:    s_endpgm
1475;
1476; GFX1032-LABEL: sub_i32_uniform:
1477; GFX1032:       ; %bb.0: ; %entry
1478; GFX1032-NEXT:    s_clause 0x1
1479; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1480; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1481; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1482; GFX1032-NEXT:    ; implicit-def: $vgpr1
1483; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1484; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1485; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1486; GFX1032-NEXT:    s_cbranch_execz BB8_2
1487; GFX1032-NEXT:  ; %bb.1:
1488; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1489; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1490; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1491; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1492; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1493; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1494; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1495; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1496; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1497; GFX1032-NEXT:    buffer_gl0_inv
1498; GFX1032-NEXT:  BB8_2:
1499; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1500; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1501; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1502; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1503; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1504; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1505; GFX1032-NEXT:    s_mov_b32 s6, -1
1506; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1507; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1508; GFX1032-NEXT:    s_endpgm
1509entry:
1510  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
1511  store i32 %old, i32 addrspace(1)* %out
1512  ret void
1513}
1514
; sub_i32_varying: atomicrmw sub (acq_rel) on the addrspace(3) global
; @local_var32 where the operand is the workitem.id.x result, so each lane
; supplies its own value; the returned old value is stored to %out.
; Per the checks below: GFX7LESS issues ds_sub_rtn_u32 directly with the
; per-lane operand, while GFX8/GFX9/GFX10 first combine the lane values with
; DPP adds, issue one ds_sub_rtn_u32 inside an s_and_saveexec region guarded
; by mbcnt == 0, and then subtract each lane's shifted partial sum from the
; v_readfirstlane'd result before the buffer store.
1515define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1516;
1517;
1518; GFX7LESS-LABEL: sub_i32_varying:
1519; GFX7LESS:       ; %bb.0: ; %entry
1520; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1521; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1522; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1523; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1524; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1525; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1526; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1527; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1528; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1529; GFX7LESS-NEXT:    s_endpgm
1530;
1531; GFX8-LABEL: sub_i32_varying:
1532; GFX8:       ; %bb.0: ; %entry
1533; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1534; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1535; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1536; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1537; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1538; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1539; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1540; GFX8-NEXT:    s_not_b64 exec, exec
1541; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1542; GFX8-NEXT:    s_not_b64 exec, exec
1543; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1544; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1545; GFX8-NEXT:    s_nop 1
1546; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1547; GFX8-NEXT:    s_nop 1
1548; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1549; GFX8-NEXT:    s_nop 1
1550; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1551; GFX8-NEXT:    s_nop 1
1552; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1553; GFX8-NEXT:    s_nop 1
1554; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1555; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
1556; GFX8-NEXT:    s_nop 0
1557; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1558; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1559; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1560; GFX8-NEXT:    ; implicit-def: $vgpr0
1561; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1562; GFX8-NEXT:    s_cbranch_execz BB9_2
1563; GFX8-NEXT:  ; %bb.1:
1564; GFX8-NEXT:    v_mov_b32_e32 v0, 0
1565; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1566; GFX8-NEXT:    s_mov_b32 m0, -1
1567; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1568; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1569; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1570; GFX8-NEXT:  BB9_2:
1571; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1572; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1573; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1574; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1575; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1576; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1577; GFX8-NEXT:    s_mov_b32 s2, -1
1578; GFX8-NEXT:    s_nop 0
1579; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1580; GFX8-NEXT:    s_endpgm
1581;
1582; GFX9-LABEL: sub_i32_varying:
1583; GFX9:       ; %bb.0: ; %entry
1584; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1585; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1586; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1587; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1588; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1589; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1590; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1591; GFX9-NEXT:    s_not_b64 exec, exec
1592; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1593; GFX9-NEXT:    s_not_b64 exec, exec
1594; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1595; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1596; GFX9-NEXT:    s_nop 1
1597; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1598; GFX9-NEXT:    s_nop 1
1599; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1600; GFX9-NEXT:    s_nop 1
1601; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1602; GFX9-NEXT:    s_nop 1
1603; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1604; GFX9-NEXT:    s_nop 1
1605; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1606; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
1607; GFX9-NEXT:    s_nop 0
1608; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1609; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1610; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1611; GFX9-NEXT:    ; implicit-def: $vgpr0
1612; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1613; GFX9-NEXT:    s_cbranch_execz BB9_2
1614; GFX9-NEXT:  ; %bb.1:
1615; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1616; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1617; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1618; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1619; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1620; GFX9-NEXT:  BB9_2:
1621; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1622; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1623; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1624; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1625; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1626; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1627; GFX9-NEXT:    s_mov_b32 s2, -1
1628; GFX9-NEXT:    s_nop 0
1629; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1630; GFX9-NEXT:    s_endpgm
1631;
1632; GFX1064-LABEL: sub_i32_varying:
1633; GFX1064:       ; %bb.0: ; %entry
1634; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1635; GFX1064-NEXT:    s_not_b64 exec, exec
1636; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1637; GFX1064-NEXT:    s_not_b64 exec, exec
1638; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1639; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1640; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1641; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1642; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1643; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1644; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1645; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1646; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1647; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
1648; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
1649; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1650; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
1651; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1652; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1653; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1654; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1655; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
1656; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
1657; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1658; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1659; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1660; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
1661; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
1662; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
1663; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1664; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1665; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
1666; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
1667; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1668; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1669; GFX1064-NEXT:    s_mov_b32 s2, -1
1670; GFX1064-NEXT:    ; implicit-def: $vgpr0
1671; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1672; GFX1064-NEXT:    s_cbranch_execz BB9_2
1673; GFX1064-NEXT:  ; %bb.1:
1674; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1675; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
1676; GFX1064-NEXT:    s_mov_b32 s3, s7
1677; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1678; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1679; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
1680; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1681; GFX1064-NEXT:    buffer_gl0_inv
1682; GFX1064-NEXT:  BB9_2:
1683; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1684; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1685; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
1686; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
1687; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
1688; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1689; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1690; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1691; GFX1064-NEXT:    s_endpgm
1692;
1693; GFX1032-LABEL: sub_i32_varying:
1694; GFX1032:       ; %bb.0: ; %entry
1695; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1696; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1697; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1698; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1699; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1700; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1701; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1702; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1703; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1704; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1705; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1706; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1707; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1708; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1709; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1710; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1711; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
1712; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
1713; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1714; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1715; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1716; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1717; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
1718; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1719; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1720; GFX1032-NEXT:    s_mov_b32 s2, -1
1721; GFX1032-NEXT:    ; implicit-def: $vgpr0
1722; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1723; GFX1032-NEXT:    s_cbranch_execz BB9_2
1724; GFX1032-NEXT:  ; %bb.1:
1725; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1726; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
1727; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1728; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1729; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
1730; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1731; GFX1032-NEXT:    buffer_gl0_inv
1732; GFX1032-NEXT:  BB9_2:
1733; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1734; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1735; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1736; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
1737; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
1738; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1739; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1741; GFX1032-NEXT:    s_endpgm
1742entry:
1743  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1744  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1745  store i32 %old, i32 addrspace(1)* %out
1746  ret void
1747}
1748
; sub_i32_varying_nouse: atomicrmw sub (acq_rel) on the addrspace(3) global
; @local_var32 with the workitem.id.x result as operand; %old has no users
; and the kernel takes no arguments, so nothing is stored.
; Per the checks below every target emits the non-returning ds_sub_u32. On
; GFX8 and later it is issued inside the mbcnt == 0 region after a DPP add
; sequence (row_shr/row_bcast on GFX8/GFX9, row_xmask plus v_permlanex16_b32
; on GFX10).
1749define amdgpu_kernel void @sub_i32_varying_nouse() {
1750; GFX7LESS-LABEL: sub_i32_varying_nouse:
1751; GFX7LESS:       ; %bb.0: ; %entry
1752; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1753; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1754; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
1756; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1757; GFX7LESS-NEXT:    s_endpgm
1758;
1759; GFX8-LABEL: sub_i32_varying_nouse:
1760; GFX8:       ; %bb.0: ; %entry
1761; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1762; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1763; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1764; GFX8-NEXT:    s_not_b64 exec, exec
1765; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1766; GFX8-NEXT:    s_not_b64 exec, exec
1767; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
1768; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1769; GFX8-NEXT:    s_nop 1
1770; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1771; GFX8-NEXT:    s_nop 1
1772; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1773; GFX8-NEXT:    s_nop 1
1774; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1775; GFX8-NEXT:    s_nop 1
1776; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1777; GFX8-NEXT:    s_nop 1
1778; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1779; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
1780; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
1781; GFX8-NEXT:    s_mov_b32 s0, s2
1782; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1783; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1784; GFX8-NEXT:    s_cbranch_execz BB10_2
1785; GFX8-NEXT:  ; %bb.1:
1786; GFX8-NEXT:    v_mov_b32_e32 v0, 0
1787; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1788; GFX8-NEXT:    s_mov_b32 m0, -1
1789; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1790; GFX8-NEXT:    ds_sub_u32 v0, v2
1791; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1792; GFX8-NEXT:  BB10_2:
1793; GFX8-NEXT:    s_endpgm
1794;
1795; GFX9-LABEL: sub_i32_varying_nouse:
1796; GFX9:       ; %bb.0: ; %entry
1797; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1798; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1799; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1800; GFX9-NEXT:    s_not_b64 exec, exec
1801; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1802; GFX9-NEXT:    s_not_b64 exec, exec
1803; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
1804; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1805; GFX9-NEXT:    s_nop 1
1806; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1807; GFX9-NEXT:    s_nop 1
1808; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1809; GFX9-NEXT:    s_nop 1
1810; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1811; GFX9-NEXT:    s_nop 1
1812; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1813; GFX9-NEXT:    s_nop 1
1814; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1815; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
1816; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
1817; GFX9-NEXT:    s_mov_b32 s0, s2
1818; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1819; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1820; GFX9-NEXT:    s_cbranch_execz BB10_2
1821; GFX9-NEXT:  ; %bb.1:
1822; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1823; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1824; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1825; GFX9-NEXT:    ds_sub_u32 v0, v2
1826; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1827; GFX9-NEXT:  BB10_2:
1828; GFX9-NEXT:    s_endpgm
1829;
1830; GFX1064-LABEL: sub_i32_varying_nouse:
1831; GFX1064:       ; %bb.0: ; %entry
1832; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1833; GFX1064-NEXT:    s_not_b64 exec, exec
1834; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1835; GFX1064-NEXT:    s_not_b64 exec, exec
1836; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1837; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1838; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1839; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1840; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1841; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1842; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1843; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1844; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1845; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1846; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1847; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
1848; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
1849; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1850; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1851; GFX1064-NEXT:    s_add_i32 s0, s2, s3
1852; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1853; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1854; GFX1064-NEXT:    s_cbranch_execz BB10_2
1855; GFX1064-NEXT:  ; %bb.1:
1856; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1857; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
1858; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1859; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1860; GFX1064-NEXT:    ds_sub_u32 v0, v3
1861; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1862; GFX1064-NEXT:    buffer_gl0_inv
1863; GFX1064-NEXT:  BB10_2:
1864; GFX1064-NEXT:    s_endpgm
1865;
1866; GFX1032-LABEL: sub_i32_varying_nouse:
1867; GFX1032:       ; %bb.0: ; %entry
1868; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1869; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1870; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1871; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1872; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
1873; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1874; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1875; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1876; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1877; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1878; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1879; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1880; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
1881; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1882; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
1883; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
1884; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1885; GFX1032-NEXT:    s_cbranch_execz BB10_2
1886; GFX1032-NEXT:  ; %bb.1:
1887; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1888; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1889; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1890; GFX1032-NEXT:    ds_sub_u32 v3, v0
1891; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1892; GFX1032-NEXT:    buffer_gl0_inv
1893; GFX1032-NEXT:  BB10_2:
1894; GFX1032-NEXT:    s_endpgm
1895entry:
1896  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1897  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1898  ret void
1899}
1900
; sub_i64_constant: atomicrmw sub (acq_rel) of the constant 5 on the
; addrspace(3) i64 global @local_var64; the old value is stored to %out.
; Per the checks below all targets take s_bcnt1 of the saved exec mask,
; multiply it by 5, and issue a single ds_sub_rtn_u64 inside the mbcnt == 0
; region. Afterwards each lane computes 5 * mbcnt (v_mul_u32_u24 and
; v_mul_hi_u32_u24) and subtracts it from the v_readfirstlane'd 64-bit result
; with a sub / sub-with-borrow pair before the buffer store.
1901define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
1902;
1903;
1904; GFX7LESS-LABEL: sub_i64_constant:
1905; GFX7LESS:       ; %bb.0: ; %entry
1906; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1907; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1908; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1909; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1910; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1911; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1912; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1913; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
1914; GFX7LESS-NEXT:  ; %bb.1:
1915; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1916; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1917; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
1918; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
1919; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1920; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1921; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
1922; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1923; GFX7LESS-NEXT:  BB11_2:
1924; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1925; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1926; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1927; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1928; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1929; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1930; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1931; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1932; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1933; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1934; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1935; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1936; GFX7LESS-NEXT:    s_endpgm
1937;
1938; GFX8-LABEL: sub_i64_constant:
1939; GFX8:       ; %bb.0: ; %entry
1940; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1941; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1942; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1943; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1944; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1945; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1946; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1947; GFX8-NEXT:    s_cbranch_execz BB11_2
1948; GFX8-NEXT:  ; %bb.1:
1949; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1950; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1951; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1952; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1953; GFX8-NEXT:    s_mov_b32 m0, -1
1954; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1955; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
1956; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX8-NEXT:  BB11_2:
1958; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1959; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1960; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1961; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1962; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1963; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1964; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1965; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1966; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1967; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1968; GFX8-NEXT:    s_mov_b32 s2, -1
1969; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1970; GFX8-NEXT:    s_endpgm
1971;
1972; GFX9-LABEL: sub_i64_constant:
1973; GFX9:       ; %bb.0: ; %entry
1974; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1975; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1976; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1977; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1978; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1979; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1980; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1981; GFX9-NEXT:    s_cbranch_execz BB11_2
1982; GFX9-NEXT:  ; %bb.1:
1983; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1984; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1985; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1986; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1987; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1988; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
1989; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1990; GFX9-NEXT:  BB11_2:
1991; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1992; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1993; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
1994; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1995; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1996; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1997; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1998; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
1999; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2000; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2001; GFX9-NEXT:    s_mov_b32 s2, -1
2002; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2003; GFX9-NEXT:    s_endpgm
2004;
2005; GFX1064-LABEL: sub_i64_constant:
2006; GFX1064:       ; %bb.0: ; %entry
2007; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2008; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2009; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2010; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2011; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2012; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2013; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2014; GFX1064-NEXT:    s_cbranch_execz BB11_2
2015; GFX1064-NEXT:  ; %bb.1:
2016; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2017; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2018; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2019; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
2020; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2021; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2022; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
2023; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2024; GFX1064-NEXT:    buffer_gl0_inv
2025; GFX1064-NEXT:  BB11_2:
2026; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2027; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2028; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2029; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2030; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2031; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2032; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v1
2033; GFX1064-NEXT:    s_mov_b32 s2, -1
2034; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2035; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2036; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2037; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2038; GFX1064-NEXT:    s_endpgm
2039;
2040; GFX1032-LABEL: sub_i64_constant:
2041; GFX1032:       ; %bb.0: ; %entry
2042; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2043; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2044; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2045; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
2046; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2047; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2048; GFX1032-NEXT:    s_cbranch_execz BB11_2
2049; GFX1032-NEXT:  ; %bb.1:
2050; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2051; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
2052; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2053; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
2054; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2055; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2056; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
2057; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2058; GFX1032-NEXT:    buffer_gl0_inv
2059; GFX1032-NEXT:  BB11_2:
2060; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2061; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2062; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2063; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2064; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2065; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2066; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v1
2067; GFX1032-NEXT:    s_mov_b32 s2, -1
2068; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2069; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2070; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2071; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2072; GFX1032-NEXT:    s_endpgm
2073entry:
2074  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2075  store i64 %old, i64 addrspace(1)* %out
2076  ret void
2077}
2078
2079define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2080;
2081;
2082; GFX7LESS-LABEL: sub_i64_uniform:
2083; GFX7LESS:       ; %bb.0: ; %entry
2084; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2085; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2086; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2087; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2088; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2089; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2090; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2091; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2092; GFX7LESS-NEXT:  ; %bb.1:
2093; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2094; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
2095; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2096; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2097; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2098; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2099; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2100; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2101; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2102; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2103; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2104; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2105; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2106; GFX7LESS-NEXT:  BB12_2:
2107; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2108; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2109; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2110; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2111; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2112; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2113; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2114; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2115; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2116; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2117; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2118; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2119; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2120; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2121; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2122; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2123; GFX7LESS-NEXT:    s_endpgm
2124;
2125; GFX8-LABEL: sub_i64_uniform:
2126; GFX8:       ; %bb.0: ; %entry
2127; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2128; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2129; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2130; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2131; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2132; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2133; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2134; GFX8-NEXT:    s_cbranch_execz BB12_2
2135; GFX8-NEXT:  ; %bb.1:
2136; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2137; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2138; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2139; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2140; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2141; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2142; GFX8-NEXT:    v_mov_b32_e32 v3, 0
2143; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2144; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2145; GFX8-NEXT:    s_mov_b32 m0, -1
2146; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2147; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2148; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2149; GFX8-NEXT:  BB12_2:
2150; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2151; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2152; GFX8-NEXT:    s_mov_b32 s4, s0
2153; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2154; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2155; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2156; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2157; GFX8-NEXT:    s_mov_b32 s5, s1
2158; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2159; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2160; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2161; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2162; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2163; GFX8-NEXT:    s_mov_b32 s6, -1
2164; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2165; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2166; GFX8-NEXT:    s_endpgm
2167;
2168; GFX9-LABEL: sub_i64_uniform:
2169; GFX9:       ; %bb.0: ; %entry
2170; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2171; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2172; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2173; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2174; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2175; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2176; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2177; GFX9-NEXT:    s_cbranch_execz BB12_2
2178; GFX9-NEXT:  ; %bb.1:
2179; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2181; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2182; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2183; GFX9-NEXT:    s_add_i32 s8, s8, s7
2184; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2185; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2186; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2187; GFX9-NEXT:    v_mov_b32_e32 v3, 0
2188; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2189; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2190; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2191; GFX9-NEXT:  BB12_2:
2192; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2193; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2194; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2195; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2196; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2197; GFX9-NEXT:    s_mov_b32 s4, s0
2198; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2199; GFX9-NEXT:    s_mov_b32 s5, s1
2200; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2201; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2202; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2203; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2204; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2205; GFX9-NEXT:    s_mov_b32 s6, -1
2206; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2207; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2208; GFX9-NEXT:    s_endpgm
2209;
2210; GFX1064-LABEL: sub_i64_uniform:
2211; GFX1064:       ; %bb.0: ; %entry
2212; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2213; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2214; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2215; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2216; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2217; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2218; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2219; GFX1064-NEXT:    s_cbranch_execz BB12_2
2220; GFX1064-NEXT:  ; %bb.1:
2221; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2222; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2223; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2224; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2225; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2226; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2227; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2228; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2229; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2230; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2231; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2232; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2233; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2234; GFX1064-NEXT:    buffer_gl0_inv
2235; GFX1064-NEXT:  BB12_2:
2236; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2237; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2238; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2239; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2240; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2241; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2242; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2243; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
2244; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2245; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2246; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2247; GFX1064-NEXT:    s_mov_b32 s2, -1
2248; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
2249; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2250; GFX1064-NEXT:    s_endpgm
2251;
2252; GFX1032-LABEL: sub_i64_uniform:
2253; GFX1032:       ; %bb.0: ; %entry
2254; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2255; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2256; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2257; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
2258; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2259; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2260; GFX1032-NEXT:    s_cbranch_execz BB12_2
2261; GFX1032-NEXT:  ; %bb.1:
2262; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2263; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2264; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2265; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2266; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2267; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2268; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2269; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2270; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2271; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2272; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2273; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2274; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2275; GFX1032-NEXT:    buffer_gl0_inv
2276; GFX1032-NEXT:  BB12_2:
2277; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2278; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2279; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2280; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2281; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2282; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2283; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2284; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
2285; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2286; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2287; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2288; GFX1032-NEXT:    s_mov_b32 s2, -1
2289; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
2290; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2291; GFX1032-NEXT:    s_endpgm
2292entry:
2293  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
2294  store i64 %old, i64 addrspace(1)* %out
2295  ret void
2296}
2297
; sub_i64_varying: each lane atomically subtracts its zero-extended workitem id
; (a per-lane, non-uniform value) from the 64-bit LDS variable @local_var64
; with acq_rel ordering and stores the returned old value to %out.
; For every target checked below the expected code is one ds_sub_rtn_u64 with
; no cross-lane reduction around it, i.e. this 64-bit varying case is expected
; to stay a plain per-lane atomic.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with the script instead of editing them by hand.
2298define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2299;
2300;
2301; GFX7LESS-LABEL: sub_i64_varying:
2302; GFX7LESS:       ; %bb.0: ; %entry
2303; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2304; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2305; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2306; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2307; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2308; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2309; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2310; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2311; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2312; GFX7LESS-NEXT:    s_endpgm
2313;
2314; GFX8-LABEL: sub_i64_varying:
2315; GFX8:       ; %bb.0: ; %entry
2316; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2317; GFX8-NEXT:    s_mov_b32 m0, -1
2318; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2319; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2320; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2321; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2322; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2323; GFX8-NEXT:    s_mov_b32 s2, -1
2324; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2325; GFX8-NEXT:    s_endpgm
2326;
2327; GFX9-LABEL: sub_i64_varying:
2328; GFX9:       ; %bb.0: ; %entry
2329; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2330; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2331; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2332; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2333; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2334; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2335; GFX9-NEXT:    s_mov_b32 s2, -1
2336; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2337; GFX9-NEXT:    s_endpgm
2338;
2339; GFX10-LABEL: sub_i64_varying:
2340; GFX10:       ; %bb.0: ; %entry
2341; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2342; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2343; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2344; GFX10-NEXT:    s_mov_b32 s2, -1
2345; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2346; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2347; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2348; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2349; GFX10-NEXT:    buffer_gl0_inv
2350; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2351; GFX10-NEXT:    s_endpgm
2352entry:
2353  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2354  %zext = zext i32 %lane to i64
2355  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
2356  store i64 %old, i64 addrspace(1)* %out
2357  ret void
2358}
2359
; and_i32_varying: atomically ANDs the workitem id (a per-lane value) into the
; 32-bit LDS variable @local_var32 with acq_rel ordering and stores the
; returned old value to %out.
; Expected code, as pinned by the checks below:
;  * GFX7LESS keeps a plain per-lane ds_and_rtn_b32.
;  * GFX8, GFX9 and both GFX10 wave sizes write -1 (the identity for AND)
;    under the inverted exec mask, combine the lane values with v_and_b32_dpp
;    steps, issue one ds_and_rtn_b32 guarded by the v_cmp_eq 0 and
;    s_and_saveexec pair, and finally AND the v_readfirstlane of the returned
;    value with the shifted scan result before the store.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with the script instead of editing them by hand.
2360define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2361;
2362;
2363; GFX7LESS-LABEL: and_i32_varying:
2364; GFX7LESS:       ; %bb.0: ; %entry
2365; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2366; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2367; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2368; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2369; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2370; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2371; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2372; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2373; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2374; GFX7LESS-NEXT:    s_endpgm
2375;
2376; GFX8-LABEL: and_i32_varying:
2377; GFX8:       ; %bb.0: ; %entry
2378; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2379; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2380; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2381; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2382; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2383; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2384; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2385; GFX8-NEXT:    s_not_b64 exec, exec
2386; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2387; GFX8-NEXT:    s_not_b64 exec, exec
2388; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2389; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2390; GFX8-NEXT:    s_nop 1
2391; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2392; GFX8-NEXT:    s_nop 1
2393; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2394; GFX8-NEXT:    s_nop 1
2395; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2396; GFX8-NEXT:    s_nop 1
2397; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2398; GFX8-NEXT:    s_nop 1
2399; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2400; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2401; GFX8-NEXT:    s_nop 0
2402; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2403; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2404; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2405; GFX8-NEXT:    ; implicit-def: $vgpr0
2406; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2407; GFX8-NEXT:    s_cbranch_execz BB14_2
2408; GFX8-NEXT:  ; %bb.1:
2409; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2410; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2411; GFX8-NEXT:    s_mov_b32 m0, -1
2412; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2413; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2414; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2415; GFX8-NEXT:  BB14_2:
2416; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2417; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2418; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2419; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2420; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2421; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2422; GFX8-NEXT:    s_mov_b32 s2, -1
2423; GFX8-NEXT:    s_nop 0
2424; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2425; GFX8-NEXT:    s_endpgm
2426;
2427; GFX9-LABEL: and_i32_varying:
2428; GFX9:       ; %bb.0: ; %entry
2429; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2430; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2431; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2432; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2433; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2434; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2435; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2436; GFX9-NEXT:    s_not_b64 exec, exec
2437; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2438; GFX9-NEXT:    s_not_b64 exec, exec
2439; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2440; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2441; GFX9-NEXT:    s_nop 1
2442; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2443; GFX9-NEXT:    s_nop 1
2444; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2445; GFX9-NEXT:    s_nop 1
2446; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2447; GFX9-NEXT:    s_nop 1
2448; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2449; GFX9-NEXT:    s_nop 1
2450; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2451; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2452; GFX9-NEXT:    s_nop 0
2453; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2454; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2455; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2456; GFX9-NEXT:    ; implicit-def: $vgpr0
2457; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2458; GFX9-NEXT:    s_cbranch_execz BB14_2
2459; GFX9-NEXT:  ; %bb.1:
2460; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2461; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2462; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2463; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2464; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2465; GFX9-NEXT:  BB14_2:
2466; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2467; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2468; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2469; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2470; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2471; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2472; GFX9-NEXT:    s_mov_b32 s2, -1
2473; GFX9-NEXT:    s_nop 0
2474; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2475; GFX9-NEXT:    s_endpgm
2476;
2477; GFX1064-LABEL: and_i32_varying:
2478; GFX1064:       ; %bb.0: ; %entry
2479; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2480; GFX1064-NEXT:    s_not_b64 exec, exec
2481; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2482; GFX1064-NEXT:    s_not_b64 exec, exec
2483; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2484; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2485; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
2486; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2487; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2488; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2489; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2490; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2491; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2492; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2493; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2494; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2495; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2496; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2497; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2498; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2499; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2500; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2501; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2502; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2503; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2504; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2505; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2506; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2507; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2508; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2509; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2510; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2511; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2512; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2513; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2514; GFX1064-NEXT:    s_mov_b32 s2, -1
2515; GFX1064-NEXT:    ; implicit-def: $vgpr0
2516; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2517; GFX1064-NEXT:    s_cbranch_execz BB14_2
2518; GFX1064-NEXT:  ; %bb.1:
2519; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2520; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2521; GFX1064-NEXT:    s_mov_b32 s3, s7
2522; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2523; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2524; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
2525; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2526; GFX1064-NEXT:    buffer_gl0_inv
2527; GFX1064-NEXT:  BB14_2:
2528; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2529; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2530; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2531; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2532; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2533; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2534; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2535; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2536; GFX1064-NEXT:    s_endpgm
2537;
2538; GFX1032-LABEL: and_i32_varying:
2539; GFX1032:       ; %bb.0: ; %entry
2540; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2541; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2542; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2543; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2544; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2545; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2546; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2547; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2548; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2549; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2550; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2551; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2552; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2553; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2554; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2555; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
2556; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2557; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2558; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2559; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2560; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2561; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2562; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2563; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2564; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2565; GFX1032-NEXT:    s_mov_b32 s2, -1
2566; GFX1032-NEXT:    ; implicit-def: $vgpr0
2567; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2568; GFX1032-NEXT:    s_cbranch_execz BB14_2
2569; GFX1032-NEXT:  ; %bb.1:
2570; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2571; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2572; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2573; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2574; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
2575; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2576; GFX1032-NEXT:    buffer_gl0_inv
2577; GFX1032-NEXT:  BB14_2:
2578; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2579; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2580; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2581; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2582; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2583; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2584; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2585; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2586; GFX1032-NEXT:    s_endpgm
2587entry:
2588  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2589  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2590  store i32 %old, i32 addrspace(1)* %out
2591  ret void
2592}
2593
; or_i32_varying: atomically ORs the workitem id (a per-lane value) into the
; 32-bit LDS variable @local_var32 with acq_rel ordering and stores the
; returned old value to %out.
; Expected code, as pinned by the checks below:
;  * GFX7LESS keeps a plain per-lane ds_or_rtn_b32.
;  * GFX8, GFX9 and both GFX10 wave sizes write 0 (the identity for OR) under
;    the inverted exec mask, combine the lane values with v_or_b32_dpp steps
;    (the row_shr steps carry bound_ctrl:1), issue one ds_or_rtn_b32 guarded
;    by the v_cmp_eq 0 and s_and_saveexec pair, and finally OR the
;    v_readfirstlane of the returned value with the shifted scan result
;    before the store.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with the script instead of editing them by hand.
2594define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
2595;
2596;
2597; GFX7LESS-LABEL: or_i32_varying:
2598; GFX7LESS:       ; %bb.0: ; %entry
2599; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2600; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2601; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2602; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2603; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2604; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2605; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2606; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2607; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2608; GFX7LESS-NEXT:    s_endpgm
2609;
2610; GFX8-LABEL: or_i32_varying:
2611; GFX8:       ; %bb.0: ; %entry
2612; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2613; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2614; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2615; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2616; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2617; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2618; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2619; GFX8-NEXT:    s_not_b64 exec, exec
2620; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2621; GFX8-NEXT:    s_not_b64 exec, exec
2622; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2623; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2624; GFX8-NEXT:    s_nop 1
2625; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2626; GFX8-NEXT:    s_nop 1
2627; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2628; GFX8-NEXT:    s_nop 1
2629; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2630; GFX8-NEXT:    s_nop 1
2631; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2632; GFX8-NEXT:    s_nop 1
2633; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2634; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2635; GFX8-NEXT:    s_nop 0
2636; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2637; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2638; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2639; GFX8-NEXT:    ; implicit-def: $vgpr0
2640; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2641; GFX8-NEXT:    s_cbranch_execz BB15_2
2642; GFX8-NEXT:  ; %bb.1:
2643; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2644; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2645; GFX8-NEXT:    s_mov_b32 m0, -1
2646; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2647; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2648; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2649; GFX8-NEXT:  BB15_2:
2650; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2651; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2652; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2653; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2654; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2655; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2656; GFX8-NEXT:    s_mov_b32 s2, -1
2657; GFX8-NEXT:    s_nop 0
2658; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2659; GFX8-NEXT:    s_endpgm
2660;
2661; GFX9-LABEL: or_i32_varying:
2662; GFX9:       ; %bb.0: ; %entry
2663; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2664; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2665; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2666; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2667; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2668; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2669; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2670; GFX9-NEXT:    s_not_b64 exec, exec
2671; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2672; GFX9-NEXT:    s_not_b64 exec, exec
2673; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2674; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2675; GFX9-NEXT:    s_nop 1
2676; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2677; GFX9-NEXT:    s_nop 1
2678; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2679; GFX9-NEXT:    s_nop 1
2680; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2681; GFX9-NEXT:    s_nop 1
2682; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2683; GFX9-NEXT:    s_nop 1
2684; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2685; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2686; GFX9-NEXT:    s_nop 0
2687; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2688; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2689; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2690; GFX9-NEXT:    ; implicit-def: $vgpr0
2691; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2692; GFX9-NEXT:    s_cbranch_execz BB15_2
2693; GFX9-NEXT:  ; %bb.1:
2694; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2695; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2696; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2697; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
2698; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2699; GFX9-NEXT:  BB15_2:
2700; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2701; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2702; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2703; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2704; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
2705; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2706; GFX9-NEXT:    s_mov_b32 s2, -1
2707; GFX9-NEXT:    s_nop 0
2708; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2709; GFX9-NEXT:    s_endpgm
2710;
2711; GFX1064-LABEL: or_i32_varying:
2712; GFX1064:       ; %bb.0: ; %entry
2713; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2714; GFX1064-NEXT:    s_not_b64 exec, exec
2715; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2716; GFX1064-NEXT:    s_not_b64 exec, exec
2717; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2718; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2719; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2720; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2721; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2722; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2723; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2724; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2725; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2726; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2727; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2728; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2729; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2730; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2731; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2732; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2733; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2734; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2735; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2736; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2737; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2738; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2739; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2740; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2741; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2742; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2743; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2744; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2745; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2746; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2747; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2748; GFX1064-NEXT:    s_mov_b32 s2, -1
2749; GFX1064-NEXT:    ; implicit-def: $vgpr0
2750; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2751; GFX1064-NEXT:    s_cbranch_execz BB15_2
2752; GFX1064-NEXT:  ; %bb.1:
2753; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2754; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2755; GFX1064-NEXT:    s_mov_b32 s3, s7
2756; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2757; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2758; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
2759; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2760; GFX1064-NEXT:    buffer_gl0_inv
2761; GFX1064-NEXT:  BB15_2:
2762; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2763; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2764; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2765; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2766; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
2767; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2768; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2769; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2770; GFX1064-NEXT:    s_endpgm
2771;
2772; GFX1032-LABEL: or_i32_varying:
2773; GFX1032:       ; %bb.0: ; %entry
2774; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2775; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2776; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2777; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2778; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2779; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2780; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2781; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2782; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2783; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2784; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2785; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2786; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2787; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2788; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2789; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2790; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2791; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2792; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2793; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2794; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2795; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2796; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2797; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2798; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2799; GFX1032-NEXT:    s_mov_b32 s2, -1
2800; GFX1032-NEXT:    ; implicit-def: $vgpr0
2801; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2802; GFX1032-NEXT:    s_cbranch_execz BB15_2
2803; GFX1032-NEXT:  ; %bb.1:
2804; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2805; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2806; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2807; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2808; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
2809; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2810; GFX1032-NEXT:    buffer_gl0_inv
2811; GFX1032-NEXT:  BB15_2:
2812; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2813; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2814; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2815; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2816; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
2817; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2818; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2819; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2820; GFX1032-NEXT:    s_endpgm
2821entry:
2822  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2823  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2824  store i32 %old, i32 addrspace(1)* %out
2825  ret void
2826}
2827
; xor_i32_varying: every work-item XORs its own workitem.id.x into the LDS
; variable @local_var32 with acq_rel ordering and stores the returned old
; value to %out. The operand differs per lane (the "varying" case).
;
; What the assertions below show:
;  * the gfx7-and-older output keeps one ds_xor_rtn_b32 fed directly by v0,
;    with no cross-lane reduction and no exec-mask guard;
;  * the tonga and gfx900 outputs write 0 to the inactive lanes (v_mov_b32
;    bracketed by s_not_b64 exec), combine lanes with v_xor_b32_dpp
;    (row_shr 1/2/4/8, then row_bcast 15 and 31), read the total from
;    lane 63, and issue a single ds_xor_rtn_b32 from the lane whose mbcnt
;    result is 0;
;  * the gfx1010 outputs use v_permlanex16_b32 plus v_readlane/v_writelane
;    where the older targets use row_bcast/wave_shr, and take the value fed
;    to the atomic from lane 63 (wave64) or lane 31 (wave32);
;  * after the atomic, v_readfirstlane_b32 broadcasts the returned value and
;    each lane XORs it with its shifted scan value before the store.
2828define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
2829;
2830;
2831; GFX7LESS-LABEL: xor_i32_varying:
2832; GFX7LESS:       ; %bb.0: ; %entry
2833; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2834; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2835; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2836; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
2838; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2839; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2840; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2841; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2842; GFX7LESS-NEXT:    s_endpgm
2843;
2844; GFX8-LABEL: xor_i32_varying:
2845; GFX8:       ; %bb.0: ; %entry
2846; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2847; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2848; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2849; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2850; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2851; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2852; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2853; GFX8-NEXT:    s_not_b64 exec, exec
2854; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2855; GFX8-NEXT:    s_not_b64 exec, exec
2856; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2857; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2858; GFX8-NEXT:    s_nop 1
2859; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2860; GFX8-NEXT:    s_nop 1
2861; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2862; GFX8-NEXT:    s_nop 1
2863; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2864; GFX8-NEXT:    s_nop 1
2865; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2866; GFX8-NEXT:    s_nop 1
2867; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2868; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2869; GFX8-NEXT:    s_nop 0
2870; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2871; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2872; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2873; GFX8-NEXT:    ; implicit-def: $vgpr0
2874; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2875; GFX8-NEXT:    s_cbranch_execz BB16_2
2876; GFX8-NEXT:  ; %bb.1:
2877; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2878; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2879; GFX8-NEXT:    s_mov_b32 m0, -1
2880; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2881; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
2882; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2883; GFX8-NEXT:  BB16_2:
2884; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2885; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2886; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2887; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2888; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
2889; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2890; GFX8-NEXT:    s_mov_b32 s2, -1
2891; GFX8-NEXT:    s_nop 0
2892; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2893; GFX8-NEXT:    s_endpgm
2894;
2895; GFX9-LABEL: xor_i32_varying:
2896; GFX9:       ; %bb.0: ; %entry
2897; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2898; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2899; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2900; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2901; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2902; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2903; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2904; GFX9-NEXT:    s_not_b64 exec, exec
2905; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2906; GFX9-NEXT:    s_not_b64 exec, exec
2907; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2908; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2909; GFX9-NEXT:    s_nop 1
2910; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2911; GFX9-NEXT:    s_nop 1
2912; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2913; GFX9-NEXT:    s_nop 1
2914; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2915; GFX9-NEXT:    s_nop 1
2916; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2917; GFX9-NEXT:    s_nop 1
2918; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2919; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2920; GFX9-NEXT:    s_nop 0
2921; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2922; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2923; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2924; GFX9-NEXT:    ; implicit-def: $vgpr0
2925; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2926; GFX9-NEXT:    s_cbranch_execz BB16_2
2927; GFX9-NEXT:  ; %bb.1:
2928; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2929; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2930; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2931; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
2932; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2933; GFX9-NEXT:  BB16_2:
2934; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2935; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2936; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2937; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2938; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
2939; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2940; GFX9-NEXT:    s_mov_b32 s2, -1
2941; GFX9-NEXT:    s_nop 0
2942; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2943; GFX9-NEXT:    s_endpgm
2944;
2945; GFX1064-LABEL: xor_i32_varying:
2946; GFX1064:       ; %bb.0: ; %entry
2947; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2948; GFX1064-NEXT:    s_not_b64 exec, exec
2949; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2950; GFX1064-NEXT:    s_not_b64 exec, exec
2951; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2952; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2953; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2954; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2955; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2956; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2957; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2958; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2959; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2960; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2961; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2962; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2963; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2964; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2965; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2966; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2967; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2968; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2969; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2970; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2971; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2972; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2973; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2974; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2975; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2976; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2977; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2978; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2979; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2980; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2981; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2982; GFX1064-NEXT:    s_mov_b32 s2, -1
2983; GFX1064-NEXT:    ; implicit-def: $vgpr0
2984; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2985; GFX1064-NEXT:    s_cbranch_execz BB16_2
2986; GFX1064-NEXT:  ; %bb.1:
2987; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2988; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2989; GFX1064-NEXT:    s_mov_b32 s3, s7
2990; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2991; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2992; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
2993; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2994; GFX1064-NEXT:    buffer_gl0_inv
2995; GFX1064-NEXT:  BB16_2:
2996; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2997; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2998; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2999; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3000; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3001; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3002; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3003; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3004; GFX1064-NEXT:    s_endpgm
3005;
3006; GFX1032-LABEL: xor_i32_varying:
3007; GFX1032:       ; %bb.0: ; %entry
3008; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3009; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3010; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3011; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3012; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3013; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3014; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3015; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3016; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3017; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3018; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3019; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3020; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3021; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3022; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3023; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3024; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3025; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3026; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3027; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3028; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3029; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3030; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3031; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3032; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3033; GFX1032-NEXT:    s_mov_b32 s2, -1
3034; GFX1032-NEXT:    ; implicit-def: $vgpr0
3035; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3036; GFX1032-NEXT:    s_cbranch_execz BB16_2
3037; GFX1032-NEXT:  ; %bb.1:
3038; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3039; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3040; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3041; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3042; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
3043; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3044; GFX1032-NEXT:    buffer_gl0_inv
3045; GFX1032-NEXT:  BB16_2:
3046; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3047; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3048; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3049; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3050; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3051; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3052; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3053; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3054; GFX1032-NEXT:    s_endpgm
3055entry:
  ; Per-lane operand: the x component of the workitem id.
3056  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Atomic xor on the addrspace(3) global; %old is the value returned by the
  ; atomicrmw, i.e. the memory contents before this lane's update.
3057  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Write the pre-update value through the addrspace(1) kernel argument.
3058  store i32 %old, i32 addrspace(1)* %out
3059  ret void
3060}
3061
; max_i32_varying: every work-item performs a signed atomic max of its own
; workitem.id.x into the LDS variable @local_var32 with acq_rel ordering and
; stores the returned old value to %out. The operand differs per lane.
;
; What the assertions below show:
;  * the gfx7-and-older output keeps one ds_max_rtn_i32 fed directly by v0,
;    with no cross-lane reduction and no exec-mask guard;
;  * the tonga and gfx900 outputs fill the inactive lanes with the value
;    built by v_bfrev_b32 1 (bit-reversed 1, i.e. 0x80000000), combine lanes
;    with v_max_i32_dpp (row_shr 1/2/4/8, then row_bcast 15 and 31), read
;    the total from lane 63, and issue a single ds_max_rtn_i32 from the lane
;    whose mbcnt result is 0;
;  * the gfx1010 outputs use v_permlanex16_b32 plus v_readlane/v_writelane
;    where the older targets use row_bcast/wave_shr, and take the value fed
;    to the atomic from lane 63 (wave64) or lane 31 (wave32);
;  * unlike the xor variant of this test, the DPP steps carry no bound_ctrl;
;  * after the atomic, v_readfirstlane_b32 broadcasts the returned value and
;    each lane takes v_max_i32 of it and its shifted scan value before the
;    store.
3062define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3063;
3064;
3065; GFX7LESS-LABEL: max_i32_varying:
3066; GFX7LESS:       ; %bb.0: ; %entry
3067; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3068; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3069; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3070; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3071; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3072; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3073; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3074; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3075; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3076; GFX7LESS-NEXT:    s_endpgm
3077;
3078; GFX8-LABEL: max_i32_varying:
3079; GFX8:       ; %bb.0: ; %entry
3080; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3081; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3082; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3083; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3084; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3085; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3086; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3087; GFX8-NEXT:    s_not_b64 exec, exec
3088; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3089; GFX8-NEXT:    s_not_b64 exec, exec
3090; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3091; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3092; GFX8-NEXT:    s_nop 1
3093; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3094; GFX8-NEXT:    s_nop 1
3095; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3096; GFX8-NEXT:    s_nop 1
3097; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3098; GFX8-NEXT:    s_nop 1
3099; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3100; GFX8-NEXT:    s_nop 1
3101; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3102; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3103; GFX8-NEXT:    s_nop 0
3104; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3105; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3106; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3107; GFX8-NEXT:    ; implicit-def: $vgpr0
3108; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3109; GFX8-NEXT:    s_cbranch_execz BB17_2
3110; GFX8-NEXT:  ; %bb.1:
3111; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3112; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3113; GFX8-NEXT:    s_mov_b32 m0, -1
3114; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3115; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3116; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3117; GFX8-NEXT:  BB17_2:
3118; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3119; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3120; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3121; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3122; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3123; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3124; GFX8-NEXT:    s_mov_b32 s2, -1
3125; GFX8-NEXT:    s_nop 0
3126; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3127; GFX8-NEXT:    s_endpgm
3128;
3129; GFX9-LABEL: max_i32_varying:
3130; GFX9:       ; %bb.0: ; %entry
3131; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3132; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3133; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3134; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3135; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3136; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3137; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3138; GFX9-NEXT:    s_not_b64 exec, exec
3139; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3140; GFX9-NEXT:    s_not_b64 exec, exec
3141; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3142; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3143; GFX9-NEXT:    s_nop 1
3144; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3145; GFX9-NEXT:    s_nop 1
3146; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3147; GFX9-NEXT:    s_nop 1
3148; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3149; GFX9-NEXT:    s_nop 1
3150; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3151; GFX9-NEXT:    s_nop 1
3152; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3153; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3154; GFX9-NEXT:    s_nop 0
3155; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3156; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3157; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3158; GFX9-NEXT:    ; implicit-def: $vgpr0
3159; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3160; GFX9-NEXT:    s_cbranch_execz BB17_2
3161; GFX9-NEXT:  ; %bb.1:
3162; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3163; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3165; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3166; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3167; GFX9-NEXT:  BB17_2:
3168; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3170; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3171; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3172; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3173; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3174; GFX9-NEXT:    s_mov_b32 s2, -1
3175; GFX9-NEXT:    s_nop 0
3176; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3177; GFX9-NEXT:    s_endpgm
3178;
3179; GFX1064-LABEL: max_i32_varying:
3180; GFX1064:       ; %bb.0: ; %entry
3181; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3182; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3183; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3184; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3185; GFX1064-NEXT:    s_not_b64 exec, exec
3186; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3187; GFX1064-NEXT:    s_not_b64 exec, exec
3188; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3189; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3190; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3191; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3192; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3193; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3194; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3195; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3196; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3197; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3198; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3199; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3200; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3201; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3202; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3203; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3204; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3205; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3206; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3207; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3208; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3209; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3210; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3211; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3212; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3213; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3214; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3215; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3216; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3217; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3218; GFX1064-NEXT:    s_mov_b32 s2, -1
3219; GFX1064-NEXT:    ; implicit-def: $vgpr0
3220; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3221; GFX1064-NEXT:    s_cbranch_execz BB17_2
3222; GFX1064-NEXT:  ; %bb.1:
3223; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3224; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3225; GFX1064-NEXT:    s_mov_b32 s3, s7
3226; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3227; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3228; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
3229; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3230; GFX1064-NEXT:    buffer_gl0_inv
3231; GFX1064-NEXT:  BB17_2:
3232; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3233; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3234; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3235; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3236; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3237; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3238; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3239; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3240; GFX1064-NEXT:    s_endpgm
3241;
3242; GFX1032-LABEL: max_i32_varying:
3243; GFX1032:       ; %bb.0: ; %entry
3244; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3245; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3246; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3247; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3248; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3249; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3250; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3251; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3252; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3253; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3254; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3255; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3256; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3257; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3258; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3259; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3260; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3261; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3262; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3263; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3264; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3265; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3266; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3267; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3268; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3269; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3270; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3271; GFX1032-NEXT:    s_mov_b32 s2, -1
3272; GFX1032-NEXT:    ; implicit-def: $vgpr0
3273; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3274; GFX1032-NEXT:    s_cbranch_execz BB17_2
3275; GFX1032-NEXT:  ; %bb.1:
3276; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3277; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3278; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3279; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3280; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
3281; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3282; GFX1032-NEXT:    buffer_gl0_inv
3283; GFX1032-NEXT:  BB17_2:
3284; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3285; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3286; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3287; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3288; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3289; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3290; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3291; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3292; GFX1032-NEXT:    s_endpgm
3293entry:
  ; Per-lane operand: the x component of the workitem id.
3294  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Signed atomic max on the addrspace(3) global; %old is the value returned
  ; by the atomicrmw, i.e. the memory contents before this lane's update.
3295  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Write the pre-update value through the addrspace(1) kernel argument.
3296  store i32 %old, i32 addrspace(1)* %out
3297  ret void
3298}
3299
; max_i64_constant: signed 64-bit atomic max of the constant 5 into the LDS
; variable @local_var64 with acq_rel ordering; the returned old value is
; stored to %out. The operand is the same in every lane and no workitem id is
; read.
;
; What the assertions below show, for every target including gfx7-and-older:
;  * v_mbcnt + v_cmp_eq_u32 against 0 + s_and_saveexec restrict the
;    ds_max_rtn_i64 to the lane whose mbcnt result is 0 (the wave32 output
;    uses only v_mbcnt_lo);
;  * no DPP or permlane reduction is emitted;
;  * after the branch rejoins, v_readfirstlane_b32 copies both halves of the
;    returned value to SGPRs, v_cndmask builds a per-lane 64-bit value keyed
;    on vcc (low half 5 or 0; high half 0 or 0x80000000, which the pre-gfx10
;    outputs materialise with v_bfrev_b32 1), and v_cmp_gt_i64 followed by
;    two v_cndmask picks the larger of the two for the store.
3300define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3301;
3302;
3303; GFX7LESS-LABEL: max_i64_constant:
3304; GFX7LESS:       ; %bb.0: ; %entry
3305; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3306; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3307; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3308; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3309; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3310; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3311; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3312; GFX7LESS-NEXT:  ; %bb.1:
3313; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
3314; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3315; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3316; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3317; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3318; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3319; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3320; GFX7LESS-NEXT:  BB18_2:
3321; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3322; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3323; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3324; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3325; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3326; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3327; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3328; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3329; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3330; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3331; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3332; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3333; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3334; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3335; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3336; GFX7LESS-NEXT:    s_endpgm
3337;
3338; GFX8-LABEL: max_i64_constant:
3339; GFX8:       ; %bb.0: ; %entry
3340; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3341; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3342; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3343; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3344; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3345; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3346; GFX8-NEXT:    s_cbranch_execz BB18_2
3347; GFX8-NEXT:  ; %bb.1:
3348; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3349; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3350; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3351; GFX8-NEXT:    s_mov_b32 m0, -1
3352; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3353; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3354; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3355; GFX8-NEXT:  BB18_2:
3356; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3357; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3358; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3359; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3360; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3361; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3362; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3363; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3364; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3365; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3366; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3367; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3368; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3369; GFX8-NEXT:    s_mov_b32 s2, -1
3370; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3371; GFX8-NEXT:    s_endpgm
3372;
3373; GFX9-LABEL: max_i64_constant:
3374; GFX9:       ; %bb.0: ; %entry
3375; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3376; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3377; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3378; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3379; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3380; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3381; GFX9-NEXT:    s_cbranch_execz BB18_2
3382; GFX9-NEXT:  ; %bb.1:
3383; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3384; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3385; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3386; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3387; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3388; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3389; GFX9-NEXT:  BB18_2:
3390; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3391; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3392; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3393; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3394; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3395; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3396; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3397; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3398; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3399; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3400; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3401; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3402; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3403; GFX9-NEXT:    s_mov_b32 s2, -1
3404; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3405; GFX9-NEXT:    s_endpgm
3406;
3407; GFX1064-LABEL: max_i64_constant:
3408; GFX1064:       ; %bb.0: ; %entry
3409; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3410; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3411; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3412; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3413; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3414; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3415; GFX1064-NEXT:    s_cbranch_execz BB18_2
3416; GFX1064-NEXT:  ; %bb.1:
3417; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3418; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3419; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3420; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3421; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3422; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3423; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3424; GFX1064-NEXT:    buffer_gl0_inv
3425; GFX1064-NEXT:  BB18_2:
3426; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3427; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3428; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3429; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3430; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3431; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3432; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3433; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3434; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3435; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3436; GFX1064-NEXT:    s_mov_b32 s2, -1
3437; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3438; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3439; GFX1064-NEXT:    s_endpgm
3440;
3441; GFX1032-LABEL: max_i64_constant:
3442; GFX1032:       ; %bb.0: ; %entry
3443; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3444; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3445; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3446; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3447; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3448; GFX1032-NEXT:    s_cbranch_execz BB18_2
3449; GFX1032-NEXT:  ; %bb.1:
3450; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3451; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3452; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3453; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3454; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3455; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3456; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3457; GFX1032-NEXT:    buffer_gl0_inv
3458; GFX1032-NEXT:  BB18_2:
3459; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3460; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3461; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3462; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3463; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3464; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3465; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
3466; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3467; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3468; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3469; GFX1032-NEXT:    s_mov_b32 s2, -1
3470; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3471; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3472; GFX1032-NEXT:    s_endpgm
3473entry:
  ; Signed 64-bit atomic max with the immediate 5 on the addrspace(3) global;
  ; %old is the value returned by the atomicrmw (contents before the update).
3474  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Write the pre-update value through the addrspace(1) kernel argument.
3475  store i64 %old, i64 addrspace(1)* %out
3476  ret void
3477}
3478
; min_i32_varying - signed 32-bit atomic min on @local_var32 (addrspace 3)
; whose operand is the work-item id, so the value differs per lane; the
; returned old value is stored to %out.
;
; Shape of the expected output captured by the autogenerated checks below
;   * GFX7LESS - one ds_min_rtn_i32 fed straight from v0; no v_min_i32_dpp
;     scan and no exec masking around the atomic.
;   * GFX8 and GFX9 - inactive lanes receive the value built by v_bfrev_b32
;     of -2 (0x7fffffff, the identity for signed min), a v_min_i32_dpp scan
;     runs between s_or_saveexec_b64 with -1 and the exec restore, a single
;     ds_min_rtn_i32 is issued behind s_and_saveexec on the mbcnt == 0
;     compare, and each lane rebuilds its result with v_readfirstlane_b32
;     followed by v_min_i32.
;   * GFX1064 and GFX1032 - same overall shape, but the cross-row steps use
;     v_permlanex16_b32 together with v_readlane_b32 / v_writelane_b32.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
3479define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3480;
3481;
3482; GFX7LESS-LABEL: min_i32_varying:
3483; GFX7LESS:       ; %bb.0: ; %entry
3484; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3485; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3486; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3487; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3488; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3489; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3490; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3491; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3492; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3493; GFX7LESS-NEXT:    s_endpgm
3494;
3495; GFX8-LABEL: min_i32_varying:
3496; GFX8:       ; %bb.0: ; %entry
3497; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3498; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3499; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3500; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3501; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3502; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3503; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3504; GFX8-NEXT:    s_not_b64 exec, exec
3505; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3506; GFX8-NEXT:    s_not_b64 exec, exec
3507; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3508; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3509; GFX8-NEXT:    s_nop 1
3510; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3511; GFX8-NEXT:    s_nop 1
3512; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3513; GFX8-NEXT:    s_nop 1
3514; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3515; GFX8-NEXT:    s_nop 1
3516; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3517; GFX8-NEXT:    s_nop 1
3518; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3519; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3520; GFX8-NEXT:    s_nop 0
3521; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3522; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3523; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3524; GFX8-NEXT:    ; implicit-def: $vgpr0
3525; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3526; GFX8-NEXT:    s_cbranch_execz BB19_2
3527; GFX8-NEXT:  ; %bb.1:
3528; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3529; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3530; GFX8-NEXT:    s_mov_b32 m0, -1
3531; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3532; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3533; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3534; GFX8-NEXT:  BB19_2:
3535; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3536; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3537; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3538; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3539; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3540; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3541; GFX8-NEXT:    s_mov_b32 s2, -1
3542; GFX8-NEXT:    s_nop 0
3543; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3544; GFX8-NEXT:    s_endpgm
3545;
3546; GFX9-LABEL: min_i32_varying:
3547; GFX9:       ; %bb.0: ; %entry
3548; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3549; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3550; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3551; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3552; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3553; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3554; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3555; GFX9-NEXT:    s_not_b64 exec, exec
3556; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3557; GFX9-NEXT:    s_not_b64 exec, exec
3558; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3559; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3560; GFX9-NEXT:    s_nop 1
3561; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3562; GFX9-NEXT:    s_nop 1
3563; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3564; GFX9-NEXT:    s_nop 1
3565; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3566; GFX9-NEXT:    s_nop 1
3567; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3568; GFX9-NEXT:    s_nop 1
3569; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3570; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3571; GFX9-NEXT:    s_nop 0
3572; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3573; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3574; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3575; GFX9-NEXT:    ; implicit-def: $vgpr0
3576; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3577; GFX9-NEXT:    s_cbranch_execz BB19_2
3578; GFX9-NEXT:  ; %bb.1:
3579; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3580; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3581; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3582; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3583; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3584; GFX9-NEXT:  BB19_2:
3585; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3586; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3587; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3588; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3589; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3590; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3591; GFX9-NEXT:    s_mov_b32 s2, -1
3592; GFX9-NEXT:    s_nop 0
3593; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3594; GFX9-NEXT:    s_endpgm
3595;
3596; GFX1064-LABEL: min_i32_varying:
3597; GFX1064:       ; %bb.0: ; %entry
3598; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3599; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3600; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3601; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3602; GFX1064-NEXT:    s_not_b64 exec, exec
3603; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3604; GFX1064-NEXT:    s_not_b64 exec, exec
3605; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3606; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3607; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3608; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3609; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3610; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3611; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3612; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3613; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3614; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3615; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3616; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3617; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3618; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3619; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3620; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3621; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3622; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3623; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3624; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3625; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3626; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3627; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3628; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3629; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3630; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3631; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3632; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3633; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3634; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3635; GFX1064-NEXT:    s_mov_b32 s2, -1
3636; GFX1064-NEXT:    ; implicit-def: $vgpr0
3637; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3638; GFX1064-NEXT:    s_cbranch_execz BB19_2
3639; GFX1064-NEXT:  ; %bb.1:
3640; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3641; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3642; GFX1064-NEXT:    s_mov_b32 s3, s7
3643; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3644; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3645; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
3646; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3647; GFX1064-NEXT:    buffer_gl0_inv
3648; GFX1064-NEXT:  BB19_2:
3649; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3650; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3651; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3652; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3653; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3654; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3655; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3656; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3657; GFX1064-NEXT:    s_endpgm
3658;
3659; GFX1032-LABEL: min_i32_varying:
3660; GFX1032:       ; %bb.0: ; %entry
3661; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3662; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3663; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3664; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3665; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3666; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3667; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3668; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3669; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3670; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3671; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3672; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3673; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3674; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3675; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3676; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3677; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3678; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3679; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3680; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3681; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3682; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3683; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3684; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3685; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3686; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3687; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3688; GFX1032-NEXT:    s_mov_b32 s2, -1
3689; GFX1032-NEXT:    ; implicit-def: $vgpr0
3690; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3691; GFX1032-NEXT:    s_cbranch_execz BB19_2
3692; GFX1032-NEXT:  ; %bb.1:
3693; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3694; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3695; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3696; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3697; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
3698; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3699; GFX1032-NEXT:    buffer_gl0_inv
3700; GFX1032-NEXT:  BB19_2:
3701; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3702; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3703; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3704; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3705; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
3706; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3707; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3708; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3709; GFX1032-NEXT:    s_endpgm
3710entry:
3711  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3712  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3713  store i32 %old, i32 addrspace(1)* %out
3714  ret void
3715}
3716
; min_i64_constant - signed 64-bit atomic min on @local_var64 (addrspace 3)
; with the constant operand 5, identical in every lane; the returned old
; value is stored to %out.
;
; Shape of the expected output captured by the autogenerated checks below
;   * Every check prefix expects exactly one ds_min_rtn_i64, issued behind
;     s_and_saveexec on the mbcnt == 0 compare (the wave32 run only needs
;     v_mbcnt_lo_u32_b32).
;   * After exec is restored, both halves of the returned value are read
;     back with v_readfirstlane_b32.
;   * A v_cndmask_b32 pair then selects, on that same compare, between 5 and
;     0x7fffffffffffffff (low half -1, high half from v_bfrev_b32 of -2 or a
;     0x7fffffff literal), and v_cmp_lt_i64 plus a second v_cndmask_b32 pair
;     forms the per-lane minimum that gets stored.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
3717define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
3718;
3719;
3720; GFX7LESS-LABEL: min_i64_constant:
3721; GFX7LESS:       ; %bb.0: ; %entry
3722; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3723; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3724; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3725; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3726; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3727; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3728; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
3729; GFX7LESS-NEXT:  ; %bb.1:
3730; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
3731; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3732; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3733; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3734; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3735; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3736; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3737; GFX7LESS-NEXT:  BB20_2:
3738; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3739; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3740; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3741; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3742; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
3743; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3744; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3745; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3746; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3747; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3748; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3749; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3750; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3751; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3752; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3753; GFX7LESS-NEXT:    s_endpgm
3754;
3755; GFX8-LABEL: min_i64_constant:
3756; GFX8:       ; %bb.0: ; %entry
3757; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3758; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3759; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3760; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3761; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3762; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3763; GFX8-NEXT:    s_cbranch_execz BB20_2
3764; GFX8-NEXT:  ; %bb.1:
3765; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3766; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3767; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3768; GFX8-NEXT:    s_mov_b32 m0, -1
3769; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3770; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3771; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3772; GFX8-NEXT:  BB20_2:
3773; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3774; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3775; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
3776; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
3777; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
3778; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3779; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3780; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3781; GFX8-NEXT:    v_mov_b32_e32 v2, s5
3782; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3783; GFX8-NEXT:    v_mov_b32_e32 v2, s4
3784; GFX8-NEXT:    s_mov_b32 s2, -1
3785; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3786; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3787; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3788; GFX8-NEXT:    s_endpgm
3789;
3790; GFX9-LABEL: min_i64_constant:
3791; GFX9:       ; %bb.0: ; %entry
3792; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3793; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3794; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3795; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3796; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3797; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3798; GFX9-NEXT:    s_cbranch_execz BB20_2
3799; GFX9-NEXT:  ; %bb.1:
3800; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3801; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3802; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3803; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3804; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3805; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3806; GFX9-NEXT:  BB20_2:
3807; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3808; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3809; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
3810; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
3811; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
3812; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3813; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3814; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3815; GFX9-NEXT:    v_mov_b32_e32 v2, s5
3816; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3817; GFX9-NEXT:    v_mov_b32_e32 v2, s4
3818; GFX9-NEXT:    s_mov_b32 s2, -1
3819; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3820; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3821; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3822; GFX9-NEXT:    s_endpgm
3823;
3824; GFX1064-LABEL: min_i64_constant:
3825; GFX1064:       ; %bb.0: ; %entry
3826; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3827; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3828; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3829; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3830; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3831; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3832; GFX1064-NEXT:    s_cbranch_execz BB20_2
3833; GFX1064-NEXT:  ; %bb.1:
3834; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3835; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3836; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3837; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3838; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3839; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3840; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3841; GFX1064-NEXT:    buffer_gl0_inv
3842; GFX1064-NEXT:  BB20_2:
3843; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3844; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3845; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3846; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3847; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
3848; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3849; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
3850; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3851; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3852; GFX1064-NEXT:    s_mov_b32 s2, -1
3853; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3854; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3855; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3856; GFX1064-NEXT:    s_endpgm
3857;
3858; GFX1032-LABEL: min_i64_constant:
3859; GFX1032:       ; %bb.0: ; %entry
3860; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3861; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3862; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3863; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3864; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3865; GFX1032-NEXT:    s_cbranch_execz BB20_2
3866; GFX1032-NEXT:  ; %bb.1:
3867; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3868; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3869; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3870; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3871; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3872; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3873; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3874; GFX1032-NEXT:    buffer_gl0_inv
3875; GFX1032-NEXT:  BB20_2:
3876; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3877; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3878; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3879; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3880; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
3881; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
3882; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
3883; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3884; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3885; GFX1032-NEXT:    s_mov_b32 s2, -1
3886; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3887; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3888; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3889; GFX1032-NEXT:    s_endpgm
3890entry:
3891  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
3892  store i64 %old, i64 addrspace(1)* %out
3893  ret void
3894}
3895
; umax_i32_varying - unsigned 32-bit atomic max on @local_var32 (addrspace 3)
; whose operand is the work-item id, so the value differs per lane; the
; returned old value is stored to %out.
;
; Shape of the expected output captured by the autogenerated checks below
;   * GFX7LESS - one ds_max_rtn_u32 fed straight from v0; no v_max_u32_dpp
;     scan and no exec masking around the atomic.
;   * GFX8 and GFX9 - inactive lanes are set to 0 (the identity for unsigned
;     max), a v_max_u32_dpp scan runs between s_or_saveexec_b64 with -1 and
;     the exec restore (the row_shr steps carry bound_ctrl:1), a single
;     ds_max_rtn_u32 is issued behind s_and_saveexec on the mbcnt == 0
;     compare, and each lane rebuilds its result with v_readfirstlane_b32
;     followed by v_max_u32.
;   * GFX1064 and GFX1032 - same overall shape, but the cross-row steps use
;     v_permlanex16_b32 together with v_readlane_b32 / v_writelane_b32.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
3896define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
3897;
3898;
3899; GFX7LESS-LABEL: umax_i32_varying:
3900; GFX7LESS:       ; %bb.0: ; %entry
3901; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3902; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3903; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3904; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3905; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
3906; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3907; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3908; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3909; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3910; GFX7LESS-NEXT:    s_endpgm
3911;
3912; GFX8-LABEL: umax_i32_varying:
3913; GFX8:       ; %bb.0: ; %entry
3914; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3915; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3916; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3917; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3918; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3919; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3920; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3921; GFX8-NEXT:    s_not_b64 exec, exec
3922; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3923; GFX8-NEXT:    s_not_b64 exec, exec
3924; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3925; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3926; GFX8-NEXT:    s_nop 1
3927; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3928; GFX8-NEXT:    s_nop 1
3929; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3930; GFX8-NEXT:    s_nop 1
3931; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3932; GFX8-NEXT:    s_nop 1
3933; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3934; GFX8-NEXT:    s_nop 1
3935; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3936; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3937; GFX8-NEXT:    s_nop 0
3938; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3939; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3940; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3941; GFX8-NEXT:    ; implicit-def: $vgpr0
3942; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3943; GFX8-NEXT:    s_cbranch_execz BB21_2
3944; GFX8-NEXT:  ; %bb.1:
3945; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3946; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3947; GFX8-NEXT:    s_mov_b32 m0, -1
3948; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3949; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
3950; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3951; GFX8-NEXT:  BB21_2:
3952; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3953; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3954; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3955; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3956; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
3957; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3958; GFX8-NEXT:    s_mov_b32 s2, -1
3959; GFX8-NEXT:    s_nop 0
3960; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3961; GFX8-NEXT:    s_endpgm
3962;
3963; GFX9-LABEL: umax_i32_varying:
3964; GFX9:       ; %bb.0: ; %entry
3965; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3966; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3967; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3968; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3969; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3970; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3971; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3972; GFX9-NEXT:    s_not_b64 exec, exec
3973; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3974; GFX9-NEXT:    s_not_b64 exec, exec
3975; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3976; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3977; GFX9-NEXT:    s_nop 1
3978; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3979; GFX9-NEXT:    s_nop 1
3980; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3981; GFX9-NEXT:    s_nop 1
3982; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3983; GFX9-NEXT:    s_nop 1
3984; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3985; GFX9-NEXT:    s_nop 1
3986; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3987; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3988; GFX9-NEXT:    s_nop 0
3989; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3990; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3991; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3992; GFX9-NEXT:    ; implicit-def: $vgpr0
3993; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3994; GFX9-NEXT:    s_cbranch_execz BB21_2
3995; GFX9-NEXT:  ; %bb.1:
3996; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3997; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3998; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3999; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4000; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4001; GFX9-NEXT:  BB21_2:
4002; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4003; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4004; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4005; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4006; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4007; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4008; GFX9-NEXT:    s_mov_b32 s2, -1
4009; GFX9-NEXT:    s_nop 0
4010; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4011; GFX9-NEXT:    s_endpgm
4012;
4013; GFX1064-LABEL: umax_i32_varying:
4014; GFX1064:       ; %bb.0: ; %entry
4015; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4016; GFX1064-NEXT:    s_not_b64 exec, exec
4017; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4018; GFX1064-NEXT:    s_not_b64 exec, exec
4019; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4020; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4021; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4022; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4023; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4024; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4025; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4026; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4027; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4028; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4029; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4030; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4031; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4032; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4033; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4034; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4035; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4036; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4037; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4038; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4039; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4040; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4041; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4042; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4043; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4044; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4045; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4046; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4047; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4048; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4049; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4050; GFX1064-NEXT:    s_mov_b32 s2, -1
4051; GFX1064-NEXT:    ; implicit-def: $vgpr0
4052; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4053; GFX1064-NEXT:    s_cbranch_execz BB21_2
4054; GFX1064-NEXT:  ; %bb.1:
4055; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4056; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4057; GFX1064-NEXT:    s_mov_b32 s3, s7
4058; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4059; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4060; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
4061; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4062; GFX1064-NEXT:    buffer_gl0_inv
4063; GFX1064-NEXT:  BB21_2:
4064; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4065; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4066; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4067; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4068; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4069; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4070; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4071; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4072; GFX1064-NEXT:    s_endpgm
4073;
4074; GFX1032-LABEL: umax_i32_varying:
4075; GFX1032:       ; %bb.0: ; %entry
4076; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4077; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4078; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4079; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4080; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4081; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4082; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4083; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4084; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4085; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4086; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4087; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4088; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4089; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4090; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4091; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4092; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4093; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4094; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4095; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4096; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4097; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4098; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4099; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4100; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4101; GFX1032-NEXT:    s_mov_b32 s2, -1
4102; GFX1032-NEXT:    ; implicit-def: $vgpr0
4103; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4104; GFX1032-NEXT:    s_cbranch_execz BB21_2
4105; GFX1032-NEXT:  ; %bb.1:
4106; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4107; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4108; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4109; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4110; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
4111; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4112; GFX1032-NEXT:    buffer_gl0_inv
4113; GFX1032-NEXT:  BB21_2:
4114; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4115; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4116; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4117; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4118; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4119; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4120; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4121; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4122; GFX1032-NEXT:    s_endpgm
4123entry:
4124  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4125  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4126  store i32 %old, i32 addrspace(1)* %out
4127  ret void
4128}
4129
4130define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4131;
4132;
4133; GFX7LESS-LABEL: umax_i64_constant:
4134; GFX7LESS:       ; %bb.0: ; %entry
4135; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4136; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4137; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4138; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4139; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4140; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4141; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4142; GFX7LESS-NEXT:  ; %bb.1:
4143; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4144; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4145; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4146; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4147; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4148; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4149; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4150; GFX7LESS-NEXT:  BB22_2:
4151; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4152; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4153; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4154; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4155; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4156; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4157; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4158; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4159; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4160; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4161; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4162; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4163; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4164; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4165; GFX7LESS-NEXT:    s_endpgm
4166;
4167; GFX8-LABEL: umax_i64_constant:
4168; GFX8:       ; %bb.0: ; %entry
4169; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4170; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4171; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4172; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4173; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4174; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4175; GFX8-NEXT:    s_cbranch_execz BB22_2
4176; GFX8-NEXT:  ; %bb.1:
4177; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4178; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4179; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4180; GFX8-NEXT:    s_mov_b32 m0, -1
4181; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4182; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4183; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4184; GFX8-NEXT:  BB22_2:
4185; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4186; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4187; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4188; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4189; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4190; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4191; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4192; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4193; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4194; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4195; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4196; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4197; GFX8-NEXT:    s_mov_b32 s2, -1
4198; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4199; GFX8-NEXT:    s_endpgm
4200;
4201; GFX9-LABEL: umax_i64_constant:
4202; GFX9:       ; %bb.0: ; %entry
4203; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4204; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4205; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4206; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4207; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4208; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4209; GFX9-NEXT:    s_cbranch_execz BB22_2
4210; GFX9-NEXT:  ; %bb.1:
4211; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4212; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4213; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4215; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4216; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4217; GFX9-NEXT:  BB22_2:
4218; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4219; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4220; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4221; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4222; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4223; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4224; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4225; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4226; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4227; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4228; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4229; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4230; GFX9-NEXT:    s_mov_b32 s2, -1
4231; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4232; GFX9-NEXT:    s_endpgm
4233;
4234; GFX1064-LABEL: umax_i64_constant:
4235; GFX1064:       ; %bb.0: ; %entry
4236; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4237; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4238; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4239; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4240; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4241; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4242; GFX1064-NEXT:    s_cbranch_execz BB22_2
4243; GFX1064-NEXT:  ; %bb.1:
4244; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4245; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4246; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4247; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4248; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4249; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4250; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4251; GFX1064-NEXT:    buffer_gl0_inv
4252; GFX1064-NEXT:  BB22_2:
4253; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4254; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4255; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4256; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4257; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4258; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4259; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4260; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4261; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
4262; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4263; GFX1064-NEXT:    s_mov_b32 s2, -1
4264; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4265; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4266; GFX1064-NEXT:    s_endpgm
4267;
4268; GFX1032-LABEL: umax_i64_constant:
4269; GFX1032:       ; %bb.0: ; %entry
4270; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4271; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4272; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4273; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4274; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4275; GFX1032-NEXT:    s_cbranch_execz BB22_2
4276; GFX1032-NEXT:  ; %bb.1:
4277; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4278; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4279; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4280; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4281; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4282; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4283; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4284; GFX1032-NEXT:    buffer_gl0_inv
4285; GFX1032-NEXT:  BB22_2:
4286; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4287; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4288; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4289; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4290; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4291; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4292; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
4293; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4294; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
4295; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4296; GFX1032-NEXT:    s_mov_b32 s2, -1
4297; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4298; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4299; GFX1032-NEXT:    s_endpgm
4300entry:
4301  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
4302  store i64 %old, i64 addrspace(1)* %out
4303  ret void
4304}
4305
; umin_i32_varying: `atomicrmw umin` (acq_rel) on the local (addrspace(3))
; global @local_var32, where the operand is the result of
; llvm.amdgcn.workitem.id.x rather than a constant. The value returned by the
; atomic is stored through %out.
;
; What the autogenerated assertions below pin down (do not hand-edit them;
; regenerate with utils/update_llc_test_checks.py):
;  - Under the GFX7LESS prefix the atomic is emitted as a plain
;    ds_min_rtn_u32 on v0; no cross-lane reduction appears in the output.
;  - Under the GFX8 and GFX9 prefixes the expected code first writes -1 (the
;    identity for an unsigned min) into v2 while exec is inverted, combines
;    lanes with a chain of v_min_u32_dpp, lets only the lanes passing the
;    v_mbcnt / v_cmp_eq_u32 0 test issue one ds_min_rtn_u32 with the
;    value read from lane 63, and finally forms each lane's result with
;    v_readfirstlane_b32 followed by v_min_u32.
;  - Under the GFX1064 and GFX1032 prefixes the same shape is expected, but
;    the cross-row steps use v_permlanex16_b32 plus v_readlane_b32 /
;    v_writelane_b32 in place of row_bcast / wave_shr.
4306define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4307;
4308;
4309; GFX7LESS-LABEL: umin_i32_varying:
4310; GFX7LESS:       ; %bb.0: ; %entry
4311; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4312; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4313; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4314; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4315; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4316; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4317; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4318; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4319; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4320; GFX7LESS-NEXT:    s_endpgm
4321;
4322; GFX8-LABEL: umin_i32_varying:
4323; GFX8:       ; %bb.0: ; %entry
4324; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4325; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4326; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4327; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4328; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4329; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4330; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4331; GFX8-NEXT:    s_not_b64 exec, exec
4332; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4333; GFX8-NEXT:    s_not_b64 exec, exec
4334; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4335; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4336; GFX8-NEXT:    s_nop 1
4337; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4338; GFX8-NEXT:    s_nop 1
4339; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4340; GFX8-NEXT:    s_nop 1
4341; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4342; GFX8-NEXT:    s_nop 1
4343; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4344; GFX8-NEXT:    s_nop 1
4345; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4346; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4347; GFX8-NEXT:    s_nop 0
4348; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4349; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4350; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4351; GFX8-NEXT:    ; implicit-def: $vgpr0
4352; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4353; GFX8-NEXT:    s_cbranch_execz BB23_2
4354; GFX8-NEXT:  ; %bb.1:
4355; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4356; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4357; GFX8-NEXT:    s_mov_b32 m0, -1
4358; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4359; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4360; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4361; GFX8-NEXT:  BB23_2:
4362; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4363; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4364; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4365; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4366; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4367; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4368; GFX8-NEXT:    s_mov_b32 s2, -1
4369; GFX8-NEXT:    s_nop 0
4370; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4371; GFX8-NEXT:    s_endpgm
4372;
4373; GFX9-LABEL: umin_i32_varying:
4374; GFX9:       ; %bb.0: ; %entry
4375; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4376; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4377; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4378; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4379; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4380; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4381; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4382; GFX9-NEXT:    s_not_b64 exec, exec
4383; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4384; GFX9-NEXT:    s_not_b64 exec, exec
4385; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4386; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4387; GFX9-NEXT:    s_nop 1
4388; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4389; GFX9-NEXT:    s_nop 1
4390; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4391; GFX9-NEXT:    s_nop 1
4392; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4393; GFX9-NEXT:    s_nop 1
4394; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4395; GFX9-NEXT:    s_nop 1
4396; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4397; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4398; GFX9-NEXT:    s_nop 0
4399; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4400; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4401; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4402; GFX9-NEXT:    ; implicit-def: $vgpr0
4403; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4404; GFX9-NEXT:    s_cbranch_execz BB23_2
4405; GFX9-NEXT:  ; %bb.1:
4406; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4407; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4408; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4409; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4410; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4411; GFX9-NEXT:  BB23_2:
4412; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4413; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4414; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4415; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4416; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4417; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4418; GFX9-NEXT:    s_mov_b32 s2, -1
4419; GFX9-NEXT:    s_nop 0
4420; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4421; GFX9-NEXT:    s_endpgm
4422;
4423; GFX1064-LABEL: umin_i32_varying:
4424; GFX1064:       ; %bb.0: ; %entry
4425; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4426; GFX1064-NEXT:    s_not_b64 exec, exec
4427; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4428; GFX1064-NEXT:    s_not_b64 exec, exec
4429; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4430; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4431; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
4432; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4433; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4434; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4435; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4436; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4437; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4438; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4439; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4440; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4441; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4442; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4443; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4444; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4445; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4446; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4447; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4448; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4449; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4450; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4451; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4452; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4453; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4454; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4455; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4456; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4457; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4458; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4459; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4460; GFX1064-NEXT:    s_mov_b32 s2, -1
4461; GFX1064-NEXT:    ; implicit-def: $vgpr0
4462; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4463; GFX1064-NEXT:    s_cbranch_execz BB23_2
4464; GFX1064-NEXT:  ; %bb.1:
4465; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4466; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4467; GFX1064-NEXT:    s_mov_b32 s3, s7
4468; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4469; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4470; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
4471; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4472; GFX1064-NEXT:    buffer_gl0_inv
4473; GFX1064-NEXT:  BB23_2:
4474; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4475; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4476; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4477; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4478; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4479; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4480; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4481; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4482; GFX1064-NEXT:    s_endpgm
4483;
4484; GFX1032-LABEL: umin_i32_varying:
4485; GFX1032:       ; %bb.0: ; %entry
4486; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4487; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4488; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4489; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4490; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4491; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4492; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4493; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4494; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4495; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4496; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4497; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4498; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4499; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4500; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4501; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
4502; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4503; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4504; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4505; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4506; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4507; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4508; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4509; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4510; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4511; GFX1032-NEXT:    s_mov_b32 s2, -1
4512; GFX1032-NEXT:    ; implicit-def: $vgpr0
4513; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4514; GFX1032-NEXT:    s_cbranch_execz BB23_2
4515; GFX1032-NEXT:  ; %bb.1:
4516; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4517; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4518; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4519; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4520; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
4521; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4522; GFX1032-NEXT:    buffer_gl0_inv
4523; GFX1032-NEXT:  BB23_2:
4524; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4525; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4526; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4527; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4528; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4529; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4530; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4531; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4532; GFX1032-NEXT:    s_endpgm
4533entry:
; The atomic operand comes from the workitem id, which is what makes this the
; "varying" variant of the umin test.
4534  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4535  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
; Storing %old keeps the atomic's return value live, so the "rtn" form of the
; DS instruction is expected in the checks above.
4536  store i32 %old, i32 addrspace(1)* %out
4537  ret void
4538}
4539
; umin_i64_constant: `atomicrmw umin` (acq_rel) of the constant 5 on the local
; (addrspace(3)) global @local_var64. The 64-bit value returned by the atomic
; is stored through %out.
;
; What the autogenerated assertions below pin down for every check prefix (do
; not hand-edit them; regenerate with utils/update_llc_test_checks.py):
;  - v_mbcnt + v_cmp_eq_u32 0 picks the lane(s) that execute a single
;    ds_min_rtn_u64 with the value 5; all other lanes skip it via
;    s_and_saveexec / s_cbranch_execz.
;  - The returned pair is read with two v_readfirstlane_b32 into SGPRs.
;  - v_cndmask selects, per lane, either all-ones (where vcc from the lane test
;    is set) or 5, and v_cmp_lt_u64 + v_cndmask then take the unsigned minimum
;    of that value and the value read back from the atomic.
;  - The wave32 output (GFX1032 prefix) uses only v_mbcnt_lo and 32-bit
;    exec/vcc operations; the other outputs use the 64-bit forms.
4540define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4541;
4542;
4543; GFX7LESS-LABEL: umin_i64_constant:
4544; GFX7LESS:       ; %bb.0: ; %entry
4545; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4546; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4547; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4548; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4549; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4550; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4551; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4552; GFX7LESS-NEXT:  ; %bb.1:
4553; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4554; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4555; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4556; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4557; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4558; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4559; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4560; GFX7LESS-NEXT:  BB24_2:
4561; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4562; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4563; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4564; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4565; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4566; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4567; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4568; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4569; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4570; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4571; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4572; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4573; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4574; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4575; GFX7LESS-NEXT:    s_endpgm
4576;
4577; GFX8-LABEL: umin_i64_constant:
4578; GFX8:       ; %bb.0: ; %entry
4579; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4580; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4581; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4582; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4583; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4584; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4585; GFX8-NEXT:    s_cbranch_execz BB24_2
4586; GFX8-NEXT:  ; %bb.1:
4587; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4588; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4589; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4590; GFX8-NEXT:    s_mov_b32 m0, -1
4591; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4592; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4593; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4594; GFX8-NEXT:  BB24_2:
4595; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4596; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4597; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4598; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4599; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4600; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4601; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4602; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4603; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4604; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4605; GFX8-NEXT:    s_mov_b32 s2, -1
4606; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4607; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4608; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4609; GFX8-NEXT:    s_endpgm
4610;
4611; GFX9-LABEL: umin_i64_constant:
4612; GFX9:       ; %bb.0: ; %entry
4613; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4614; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4615; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4616; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4617; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4618; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4619; GFX9-NEXT:    s_cbranch_execz BB24_2
4620; GFX9-NEXT:  ; %bb.1:
4621; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4622; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4623; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4624; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4625; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4626; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4627; GFX9-NEXT:  BB24_2:
4628; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4629; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4630; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4631; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4632; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4633; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4634; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4635; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4636; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4637; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4638; GFX9-NEXT:    s_mov_b32 s2, -1
4639; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4640; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4641; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4642; GFX9-NEXT:    s_endpgm
4643;
4644; GFX1064-LABEL: umin_i64_constant:
4645; GFX1064:       ; %bb.0: ; %entry
4646; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4647; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4648; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4649; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4650; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4651; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4652; GFX1064-NEXT:    s_cbranch_execz BB24_2
4653; GFX1064-NEXT:  ; %bb.1:
4654; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4655; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4656; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4657; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4658; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4659; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4660; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4661; GFX1064-NEXT:    buffer_gl0_inv
4662; GFX1064-NEXT:  BB24_2:
4663; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4664; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4665; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4666; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4667; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4668; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4669; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
4670; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4671; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4672; GFX1064-NEXT:    s_mov_b32 s2, -1
4673; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4674; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4675; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4676; GFX1064-NEXT:    s_endpgm
4677;
4678; GFX1032-LABEL: umin_i64_constant:
4679; GFX1032:       ; %bb.0: ; %entry
4680; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4681; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4682; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4683; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4684; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4685; GFX1032-NEXT:    s_cbranch_execz BB24_2
4686; GFX1032-NEXT:  ; %bb.1:
4687; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4688; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4689; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4690; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4691; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4692; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4693; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4694; GFX1032-NEXT:    buffer_gl0_inv
4695; GFX1032-NEXT:  BB24_2:
4696; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4697; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4698; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4699; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4700; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4701; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4702; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
4703; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4704; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4705; GFX1032-NEXT:    s_mov_b32 s2, -1
4706; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4707; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4708; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4709; GFX1032-NEXT:    s_endpgm
4710entry:
; The operand is the immediate 5 (the "constant" variant), so every lane
; contributes the same value to the unsigned-min atomic.
4711  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
; Storing %old keeps the atomic's return value live, so the "rtn" form of the
; DS instruction is expected in the checks above.
4712  store i64 %old, i64 addrspace(1)* %out
4713  ret void
4714}
4715