; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7
; Volatile, non-atomic i32 load through the addrspace(3) pointer %in; the
; loaded value is then written through the addrspace(1) pointer %out with an
; ordinary store. Every configuration expects the load to appear as one
; ds_read_b32 and expects a wait on lgkmcnt(0) before the store instruction.
define amdgpu_kernel void @local_volatile_load_0(
; GFX6-LABEL: local_volatile_load_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_volatile_load_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    ds_read_b32 v2, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_volatile_load_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_volatile_load_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_volatile_load_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
  %val = load volatile i32, i32 addrspace(3)* %in, align 4
  store i32 %val, i32 addrspace(1)* %out
  ret void
}
82
; Same shape as a plain volatile addrspace(3) load, but the address is %in
; indexed by the workitem id x (getelementptr inbounds on i32 elements). The
; expected code forms the address with a shift-left-by-2 and an add
; (v_lshlrev_b32 + v_add_i32 on gfx600/gfx700, a single v_lshl_add_u32 on
; gfx1010) before the ds_read_b32.
define amdgpu_kernel void @local_volatile_load_1(
; GFX6-LABEL: local_volatile_load_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_volatile_load_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    ds_read_b32 v2, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_volatile_load_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_volatile_load_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_volatile_load_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
  %val = load volatile i32, i32 addrspace(3)* %val.gep, align 4
  store i32 %val, i32 addrspace(1)* %out
  ret void
}
162
; Ordinary i32 load through the addrspace(1) pointer %in followed by a
; volatile, non-atomic store of that value through the addrspace(3) pointer
; %out. Every configuration expects the store to appear as one ds_write_b32
; placed directly before s_endpgm.
define amdgpu_kernel void @local_volatile_store_0(
; GFX6-LABEL: local_volatile_store_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_volatile_store_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_volatile_store_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_volatile_store_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_volatile_store_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load i32, i32 addrspace(1)* %in, align 4
  store volatile i32 %val, i32 addrspace(3)* %out
  ret void
}
234
; Volatile, non-atomic i32 store to %out indexed by the workitem id x
; (getelementptr inbounds on i32 elements, addrspace(3)); the stored value
; comes from an ordinary load through the addrspace(1) pointer %in. The
; expected code forms the address with a shift-left-by-2 and an add
; (v_lshlrev_b32 + v_add_i32 on gfx600/gfx700, a single v_lshl_add_u32 on
; gfx1010) and ends with one ds_write_b32.
define amdgpu_kernel void @local_volatile_store_1(
; GFX6-LABEL: local_volatile_store_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_volatile_store_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_volatile_store_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_volatile_store_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_volatile_store_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val = load i32, i32 addrspace(1)* %in, align 4
  %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
  store volatile i32 %val, i32 addrspace(3)* %out.gep
  ret void
}
311
; Atomic volatile i32 load with syncscope("workgroup") acquire ordering through
; the addrspace(3) pointer %in; the result is written through the addrspace(3)
; pointer %out with an ordinary store. All configurations expect ds_read_b32
; followed by a wait on lgkmcnt(0). Only the gfx1010 run without +cumode
; additionally expects a buffer_gl0_inv after that wait.
define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
; GFX6-LABEL: local_volatile_workgroup_acquire_load:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xa
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b32 v1, v0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_volatile_workgroup_acquire_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_read_b32 v0, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b32 v1, v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v1, v0
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic volatile i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}
378
; Atomic volatile i32 store of the kernel argument %in with
; syncscope("workgroup") release ordering through the addrspace(3) pointer
; %out. All configurations expect a wait immediately before the ds_write_b32.
; The gfx1010 run without +cumode expects that wait to be
; "s_waitcnt vmcnt(0) lgkmcnt(0)" plus "s_waitcnt_vscnt null, 0x0"; every other
; run expects only "s_waitcnt lgkmcnt(0)".
define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX6-LABEL: local_volatile_workgroup_release_store:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xa
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_volatile_workgroup_release_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_volatile_workgroup_release_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_volatile_workgroup_release_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic volatile i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
  ret void
}
439
; Intrinsic called by the kernels in this file that index their addrspace(3)
; pointer with the workitem id x.
declare i32 @llvm.amdgcn.workitem.id.x()
441