1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
3; RUN: llc  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
; Intrinsics called by the test functions in this file.
8declare i1 @llvm.amdgcn.wqm.vote(i1)
9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
10declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)
11
12; Show what the atomic optimization pass will do for raw buffers.
13
; add_i32_constant: amdgpu_ps function whose atomic operand is the constant 5.
; It calls raw.buffer.atomic.add on %inout and stores the returned value to
; %out only when both wqm.vote calls surrounding the atomic are true.
; In the checked output of every configuration below, a single
; buffer_atomic_add is issued with the value (s_bcnt1 of the saved exec mask)
; multiplied by 5, under an exec mask restricted to lanes whose v_mbcnt result
; equals 0; the stored value is then formed with v_readfirstlane followed by
; v_mad_u32_u24 v0, v0, 5, s4.
14define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
15; GFX7-LABEL: add_i32_constant:
16; GFX7:       ; %bb.0: ; %entry
17; GFX7-NEXT:    s_mov_b64 s[10:11], exec
18; GFX7-NEXT:    ; implicit-def: $vgpr0
19; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
20; GFX7-NEXT:    s_cbranch_execz BB0_4
21; GFX7-NEXT:  ; %bb.1:
22; GFX7-NEXT:    s_mov_b64 s[12:13], exec
23; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
24; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s13, v0
25; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
26; GFX7-NEXT:    ; implicit-def: $vgpr1
27; GFX7-NEXT:    s_and_saveexec_b64 s[10:11], vcc
28; GFX7-NEXT:    s_cbranch_execz BB0_3
29; GFX7-NEXT:  ; %bb.2:
30; GFX7-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
31; GFX7-NEXT:    s_mul_i32 s12, s12, 5
32; GFX7-NEXT:    v_mov_b32_e32 v1, s12
33; GFX7-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
34; GFX7-NEXT:  BB0_3:
35; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
36; GFX7-NEXT:    s_waitcnt vmcnt(0)
37; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
38; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
39; GFX7-NEXT:  BB0_4: ; %Flow
40; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
41; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
42; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
43; GFX7-NEXT:    s_cbranch_vccnz BB0_6
44; GFX7-NEXT:  ; %bb.5: ; %if
45; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
46; GFX7-NEXT:  BB0_6: ; %UnifiedReturnBlock
47; GFX7-NEXT:    s_endpgm
48;
49; GFX89-LABEL: add_i32_constant:
50; GFX89:       ; %bb.0: ; %entry
51; GFX89-NEXT:    s_mov_b64 s[10:11], exec
52; GFX89-NEXT:    ; implicit-def: $vgpr0
53; GFX89-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
54; GFX89-NEXT:    s_cbranch_execz BB0_4
55; GFX89-NEXT:  ; %bb.1:
56; GFX89-NEXT:    s_mov_b64 s[12:13], exec
57; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
58; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
59; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
60; GFX89-NEXT:    ; implicit-def: $vgpr1
61; GFX89-NEXT:    s_and_saveexec_b64 s[10:11], vcc
62; GFX89-NEXT:    s_cbranch_execz BB0_3
63; GFX89-NEXT:  ; %bb.2:
64; GFX89-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
65; GFX89-NEXT:    s_mul_i32 s12, s12, 5
66; GFX89-NEXT:    v_mov_b32_e32 v1, s12
67; GFX89-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
68; GFX89-NEXT:  BB0_3:
69; GFX89-NEXT:    s_or_b64 exec, exec, s[10:11]
70; GFX89-NEXT:    s_waitcnt vmcnt(0)
71; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
72; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
73; GFX89-NEXT:  BB0_4: ; %Flow
74; GFX89-NEXT:    s_or_b64 exec, exec, s[8:9]
75; GFX89-NEXT:    s_wqm_b64 s[4:5], -1
76; GFX89-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
77; GFX89-NEXT:    s_cbranch_vccnz BB0_6
78; GFX89-NEXT:  ; %bb.5: ; %if
79; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
80; GFX89-NEXT:  BB0_6: ; %UnifiedReturnBlock
81; GFX89-NEXT:    s_endpgm
82;
83; GFX1064-LABEL: add_i32_constant:
84; GFX1064:       ; %bb.0: ; %entry
85; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
86; GFX1064-NEXT:    ; implicit-def: $vgpr0
87; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
88; GFX1064-NEXT:    s_cbranch_execz BB0_4
89; GFX1064-NEXT:  ; %bb.1:
90; GFX1064-NEXT:    s_mov_b64 s[12:13], exec
91; GFX1064-NEXT:    ; implicit-def: $vgpr1
92; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
93; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
94; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
95; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
96; GFX1064-NEXT:    s_cbranch_execz BB0_3
97; GFX1064-NEXT:  ; %bb.2:
98; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
99; GFX1064-NEXT:    s_mul_i32 s12, s12, 5
100; GFX1064-NEXT:    v_mov_b32_e32 v1, s12
101; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
102; GFX1064-NEXT:  BB0_3:
103; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
104; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
105; GFX1064-NEXT:    s_waitcnt vmcnt(0)
106; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
107; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
108; GFX1064-NEXT:  BB0_4: ; %Flow
109; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
110; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
111; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
112; GFX1064-NEXT:    s_cbranch_vccnz BB0_6
113; GFX1064-NEXT:  ; %bb.5: ; %if
114; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
115; GFX1064-NEXT:  BB0_6: ; %UnifiedReturnBlock
116; GFX1064-NEXT:    s_endpgm
117;
118; GFX1032-LABEL: add_i32_constant:
119; GFX1032:       ; %bb.0: ; %entry
120; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
121; GFX1032-NEXT:    ; implicit-def: $vgpr0
122; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
123; GFX1032-NEXT:    s_cbranch_execz BB0_4
124; GFX1032-NEXT:  ; %bb.1:
125; GFX1032-NEXT:    s_mov_b32 s10, exec_lo
126; GFX1032-NEXT:    ; implicit-def: $vgpr1
127; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
128; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
129; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
130; GFX1032-NEXT:    s_cbranch_execz BB0_3
131; GFX1032-NEXT:  ; %bb.2:
132; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
133; GFX1032-NEXT:    s_mul_i32 s10, s10, 5
134; GFX1032-NEXT:    v_mov_b32_e32 v1, s10
135; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
136; GFX1032-NEXT:  BB0_3:
137; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
138; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
139; GFX1032-NEXT:    s_waitcnt vmcnt(0)
140; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
141; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
142; GFX1032-NEXT:  BB0_4: ; %Flow
143; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
144; GFX1032-NEXT:    s_wqm_b32 s4, -1
145; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
146; GFX1032-NEXT:    s_cbranch_vccnz BB0_6
147; GFX1032-NEXT:  ; %bb.5: ; %if
148; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
149; GFX1032-NEXT:  BB0_6: ; %UnifiedReturnBlock
150; GFX1032-NEXT:    s_endpgm
151entry:
; One wqm.vote call is placed before the atomic and one after it.
152  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
; Atomic add of the constant 5 into %inout; %old is the value the intrinsic returns.
153  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
154  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
; Take the %if path only when both votes are true.
155  %cond = and i1 %cond1, %cond2
156  br i1 %cond, label %if, label %else
157if:
; Reinterpret the i32 atomic result as float and store it to %out.
158  %bitcast = bitcast i32 %old to float
159  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
160  ret void
161else:
162  ret void
163}
164
; add_i32_varying: same IR shape as add_i32_constant, but the atomic operand
; is the non-inreg argument %val (v0 in the checked output) rather than a
; constant. The returned value is stored to %out only when both wqm.vote
; calls surrounding the atomic are true.
; Expected output differs between configurations: the GFX7 checks contain only
; a plain buffer_atomic_add on v0, while the GFX8, GFX9, GFX1064 and GFX1032
; checks contain a DPP-based add sequence (row_shr steps, then row_bcast on
; GFX8/GFX9 or v_permlanex16 on the gfx1010 configurations) ahead of a single
; buffer_atomic_add that is guarded by a v_mbcnt == 0 comparison.
165define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
166; GFX7-LABEL: add_i32_varying:
167; GFX7:       ; %bb.0: ; %entry
168; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
169; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
170; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
171; GFX7-NEXT:    s_cbranch_vccnz BB1_2
172; GFX7-NEXT:  ; %bb.1: ; %if
173; GFX7-NEXT:    s_waitcnt vmcnt(0)
174; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
175; GFX7-NEXT:  BB1_2: ; %else
176; GFX7-NEXT:    s_endpgm
177;
178; GFX8-LABEL: add_i32_varying:
179; GFX8:       ; %bb.0: ; %entry
180; GFX8-NEXT:    s_mov_b64 s[8:9], exec
181; GFX8-NEXT:    s_mov_b64 s[10:11], s[8:9]
182; GFX8-NEXT:    v_mov_b32_e32 v2, v0
183; GFX8-NEXT:    ; implicit-def: $vgpr0
184; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
185; GFX8-NEXT:    s_cbranch_execz BB1_4
186; GFX8-NEXT:  ; %bb.1:
187; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
188; GFX8-NEXT:    v_mov_b32_e32 v1, 0
189; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
190; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
191; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
192; GFX8-NEXT:    s_not_b64 exec, exec
193; GFX8-NEXT:    v_mov_b32_e32 v2, 0
194; GFX8-NEXT:    s_not_b64 exec, exec
195; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
196; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
197; GFX8-NEXT:    s_nop 1
198; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
199; GFX8-NEXT:    s_nop 1
200; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
201; GFX8-NEXT:    s_nop 1
202; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
203; GFX8-NEXT:    s_nop 1
204; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
205; GFX8-NEXT:    s_nop 1
206; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
207; GFX8-NEXT:    v_readlane_b32 s12, v2, 63
208; GFX8-NEXT:    s_nop 0
209; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
210; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
211; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
212; GFX8-NEXT:    ; implicit-def: $vgpr0
213; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
214; GFX8-NEXT:    s_cbranch_execz BB1_3
215; GFX8-NEXT:  ; %bb.2:
216; GFX8-NEXT:    v_mov_b32_e32 v0, s12
217; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
218; GFX8-NEXT:  BB1_3:
219; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
220; GFX8-NEXT:    s_waitcnt vmcnt(0)
221; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
222; GFX8-NEXT:    v_mov_b32_e32 v0, v1
223; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
224; GFX8-NEXT:  BB1_4: ; %Flow
225; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
226; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
227; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
228; GFX8-NEXT:    s_cbranch_vccnz BB1_6
229; GFX8-NEXT:  ; %bb.5: ; %if
230; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
231; GFX8-NEXT:  BB1_6: ; %UnifiedReturnBlock
232; GFX8-NEXT:    s_endpgm
233;
234; GFX9-LABEL: add_i32_varying:
235; GFX9:       ; %bb.0: ; %entry
236; GFX9-NEXT:    s_mov_b64 s[8:9], exec
237; GFX9-NEXT:    s_mov_b64 s[10:11], s[8:9]
238; GFX9-NEXT:    v_mov_b32_e32 v2, v0
239; GFX9-NEXT:    ; implicit-def: $vgpr0
240; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
241; GFX9-NEXT:    s_cbranch_execz BB1_4
242; GFX9-NEXT:  ; %bb.1:
243; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
244; GFX9-NEXT:    v_mov_b32_e32 v1, 0
245; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
246; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
247; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
248; GFX9-NEXT:    s_not_b64 exec, exec
249; GFX9-NEXT:    v_mov_b32_e32 v2, 0
250; GFX9-NEXT:    s_not_b64 exec, exec
251; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
252; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
253; GFX9-NEXT:    s_nop 1
254; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
255; GFX9-NEXT:    s_nop 1
256; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
257; GFX9-NEXT:    s_nop 1
258; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
259; GFX9-NEXT:    s_nop 1
260; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
261; GFX9-NEXT:    s_nop 1
262; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
263; GFX9-NEXT:    v_readlane_b32 s12, v2, 63
264; GFX9-NEXT:    s_nop 0
265; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
266; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
267; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
268; GFX9-NEXT:    ; implicit-def: $vgpr0
269; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
270; GFX9-NEXT:    s_cbranch_execz BB1_3
271; GFX9-NEXT:  ; %bb.2:
272; GFX9-NEXT:    v_mov_b32_e32 v0, s12
273; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
274; GFX9-NEXT:  BB1_3:
275; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
276; GFX9-NEXT:    s_waitcnt vmcnt(0)
277; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
278; GFX9-NEXT:    v_mov_b32_e32 v0, v1
279; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
280; GFX9-NEXT:  BB1_4: ; %Flow
281; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
282; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
283; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
284; GFX9-NEXT:    s_cbranch_vccnz BB1_6
285; GFX9-NEXT:  ; %bb.5: ; %if
286; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
287; GFX9-NEXT:  BB1_6: ; %UnifiedReturnBlock
288; GFX9-NEXT:    s_endpgm
289;
290; GFX1064-LABEL: add_i32_varying:
291; GFX1064:       ; %bb.0: ; %entry
292; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
293; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
294; GFX1064-NEXT:    s_mov_b64 s[10:11], s[8:9]
295; GFX1064-NEXT:    ; implicit-def: $vgpr0
296; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
297; GFX1064-NEXT:    s_cbranch_execz BB1_4
298; GFX1064-NEXT:  ; %bb.1:
299; GFX1064-NEXT:    s_not_b64 exec, exec
300; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
301; GFX1064-NEXT:    s_not_b64 exec, exec
302; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
303; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
304; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
305; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
306; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
307; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
308; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
309; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
310; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
311; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
312; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
313; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
314; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
315; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
316; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
317; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
318; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
319; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
320; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
321; GFX1064-NEXT:    v_readlane_b32 s12, v1, 63
322; GFX1064-NEXT:    v_readlane_b32 s14, v1, 47
323; GFX1064-NEXT:    v_writelane_b32 v3, s13, 32
324; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
325; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
326; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
327; GFX1064-NEXT:    v_writelane_b32 v3, s14, 48
328; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
329; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
330; GFX1064-NEXT:    ; implicit-def: $vgpr0
331; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
332; GFX1064-NEXT:    s_cbranch_execz BB1_3
333; GFX1064-NEXT:  ; %bb.2:
334; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
335; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
336; GFX1064-NEXT:  BB1_3:
337; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
338; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
339; GFX1064-NEXT:    s_waitcnt vmcnt(0)
340; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
341; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
342; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s4, v0
343; GFX1064-NEXT:  BB1_4: ; %Flow
344; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
345; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
346; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
347; GFX1064-NEXT:    s_cbranch_vccnz BB1_6
348; GFX1064-NEXT:  ; %bb.5: ; %if
349; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
350; GFX1064-NEXT:  BB1_6: ; %UnifiedReturnBlock
351; GFX1064-NEXT:    s_endpgm
352;
353; GFX1032-LABEL: add_i32_varying:
354; GFX1032:       ; %bb.0: ; %entry
355; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
356; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
357; GFX1032-NEXT:    s_mov_b32 s9, s8
358; GFX1032-NEXT:    ; implicit-def: $vgpr0
359; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
360; GFX1032-NEXT:    s_cbranch_execz BB1_4
361; GFX1032-NEXT:  ; %bb.1:
362; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
363; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
364; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
365; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
366; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
367; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
368; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
369; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
370; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
371; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
372; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
373; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
374; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
375; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
376; GFX1032-NEXT:    v_readlane_b32 s10, v1, 15
377; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
378; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
379; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
380; GFX1032-NEXT:    v_writelane_b32 v3, s10, 16
381; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
382; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
383; GFX1032-NEXT:    ; implicit-def: $vgpr0
384; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
385; GFX1032-NEXT:    s_cbranch_execz BB1_3
386; GFX1032-NEXT:  ; %bb.2:
387; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
388; GFX1032-NEXT:    s_mov_b32 s10, s11
389; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
390; GFX1032-NEXT:  BB1_3:
391; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
392; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
393; GFX1032-NEXT:    s_waitcnt vmcnt(0)
394; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
395; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
396; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s4, v0
397; GFX1032-NEXT:  BB1_4: ; %Flow
398; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
399; GFX1032-NEXT:    s_wqm_b32 s4, -1
400; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
401; GFX1032-NEXT:    s_cbranch_vccnz BB1_6
402; GFX1032-NEXT:  ; %bb.5: ; %if
403; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
404; GFX1032-NEXT:  BB1_6: ; %UnifiedReturnBlock
405; GFX1032-NEXT:    s_endpgm
406entry:
; One wqm.vote call is placed before the atomic and one after it.
407  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
; Atomic add of the argument %val into %inout; %old is the value the intrinsic returns.
408  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
409  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
; Take the %if path only when both votes are true.
410  %cond = and i1 %cond1, %cond2
411  br i1 %cond, label %if, label %else
412if:
; Reinterpret the i32 atomic result as float and store it to %out.
413  %bitcast = bitcast i32 %old to float
414  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
415  ret void
416else:
417  ret void
418}
419