1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7
; Kernel: volatile i32 load from the global pointer %in, followed by an
; ordinary (non-volatile) store of the loaded value to %out.
; What the autogenerated checks below pin down: the load instruction carries
; "glc" under the GFX6, GFX7 and SKIP-CACHE-INV prefixes and "glc dlc" under
; both GFX10 prefixes, and in every configuration "s_waitcnt vmcnt(0)" comes
; directly after the load.
8define amdgpu_kernel void @global_volatile_load_0(
9; GFX6-LABEL: global_volatile_load_0:
10; GFX6:       ; %bb.0: ; %entry
11; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
12; GFX6-NEXT:    s_mov_b32 s3, 0xf000
13; GFX6-NEXT:    s_mov_b32 s2, -1
14; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX6-NEXT:    s_mov_b32 s0, s4
16; GFX6-NEXT:    s_mov_b32 s1, s5
17; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
18; GFX6-NEXT:    s_waitcnt vmcnt(0)
19; GFX6-NEXT:    s_mov_b32 s4, s6
20; GFX6-NEXT:    s_mov_b32 s5, s7
21; GFX6-NEXT:    s_mov_b32 s6, s2
22; GFX6-NEXT:    s_mov_b32 s7, s3
23; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
24; GFX6-NEXT:    s_endpgm
25;
26; GFX7-LABEL: global_volatile_load_0:
27; GFX7:       ; %bb.0: ; %entry
28; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
29; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
30; GFX7-NEXT:    v_mov_b32_e32 v0, s0
31; GFX7-NEXT:    v_mov_b32_e32 v1, s1
32; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
33; GFX7-NEXT:    s_waitcnt vmcnt(0)
34; GFX7-NEXT:    v_mov_b32_e32 v2, s2
35; GFX7-NEXT:    v_mov_b32_e32 v3, s3
36; GFX7-NEXT:    flat_store_dword v[2:3], v0
37; GFX7-NEXT:    s_endpgm
38;
39; GFX10-WGP-LABEL: global_volatile_load_0:
40; GFX10-WGP:       ; %bb.0: ; %entry
41; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
42; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
43; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
45; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
46; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
47; GFX10-WGP-NEXT:    s_endpgm
48;
49; GFX10-CU-LABEL: global_volatile_load_0:
50; GFX10-CU:       ; %bb.0: ; %entry
51; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
52; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
53; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
55; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
56; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
57; GFX10-CU-NEXT:    s_endpgm
58;
59; SKIP-CACHE-INV-LABEL: global_volatile_load_0:
60; SKIP-CACHE-INV:       ; %bb.0: ; %entry
61; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
62; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
63; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
64; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
65; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
66; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
67; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
68; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
69; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
70; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
71; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
72; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
73; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
74; SKIP-CACHE-INV-NEXT:    s_endpgm
75    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
76entry:
  ; Operation under test: the volatile load. The store exists only to consume
  ; the loaded value and is itself non-volatile.
77  %val = load volatile i32, i32 addrspace(1)* %in, align 4
78  store i32 %val, i32 addrspace(1)* %out
79  ret void
80}
81
; Kernel: volatile i32 load from global memory at %in indexed by the x
; work-item id, followed by an ordinary store of the loaded value to %out.
; Same shape as a plain volatile load, but the address is per-lane, so the
; checks below expect a vector-addressed load (addr64 buffer load, flat load
; with a computed 64-bit address, or global load with a VGPR offset).
; The pinned expectations are the same as for the uniform-address case: "glc"
; under the GFX6, GFX7 and SKIP-CACHE-INV prefixes, "glc dlc" under both GFX10
; prefixes, and "s_waitcnt vmcnt(0)" directly after the load everywhere.
82define amdgpu_kernel void @global_volatile_load_1(
83; GFX6-LABEL: global_volatile_load_1:
84; GFX6:       ; %bb.0: ; %entry
85; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
86; GFX6-NEXT:    s_mov_b32 s3, 0xf000
87; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
88; GFX6-NEXT:    v_mov_b32_e32 v1, 0
89; GFX6-NEXT:    s_mov_b32 s2, -1
90; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
91; GFX6-NEXT:    s_mov_b32 s0, s6
92; GFX6-NEXT:    s_mov_b32 s1, s7
93; GFX6-NEXT:    s_mov_b32 s6, 0
94; GFX6-NEXT:    s_mov_b32 s7, s3
95; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
96; GFX6-NEXT:    s_waitcnt vmcnt(0)
97; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
98; GFX6-NEXT:    s_endpgm
99;
100; GFX7-LABEL: global_volatile_load_1:
101; GFX7:       ; %bb.0: ; %entry
102; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
103; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
104; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX7-NEXT:    v_mov_b32_e32 v3, s1
106; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
107; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
108; GFX7-NEXT:    flat_load_dword v2, v[2:3] glc
109; GFX7-NEXT:    s_waitcnt vmcnt(0)
110; GFX7-NEXT:    v_mov_b32_e32 v0, s2
111; GFX7-NEXT:    v_mov_b32_e32 v1, s3
112; GFX7-NEXT:    flat_store_dword v[0:1], v2
113; GFX7-NEXT:    s_endpgm
114;
115; GFX10-WGP-LABEL: global_volatile_load_1:
116; GFX10-WGP:       ; %bb.0: ; %entry
117; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
118; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
119; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
120; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX10-WGP-NEXT:    global_load_dword v0, v0, s[0:1] glc dlc
122; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
123; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[2:3]
124; GFX10-WGP-NEXT:    s_endpgm
125;
126; GFX10-CU-LABEL: global_volatile_load_1:
127; GFX10-CU:       ; %bb.0: ; %entry
128; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
129; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
130; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
131; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
132; GFX10-CU-NEXT:    global_load_dword v0, v0, s[0:1] glc dlc
133; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
134; GFX10-CU-NEXT:    global_store_dword v1, v0, s[2:3]
135; GFX10-CU-NEXT:    s_endpgm
136;
137; SKIP-CACHE-INV-LABEL: global_volatile_load_1:
138; SKIP-CACHE-INV:       ; %bb.0: ; %entry
139; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
140; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
141; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
143; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
144; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
145; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s6
146; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
147; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
148; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
149; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
150; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
151; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
152; SKIP-CACHE-INV-NEXT:    s_endpgm
153    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
154entry:
  ; Index %in by the work-item id so the volatile load has a per-lane address.
155  %tid = call i32 @llvm.amdgcn.workitem.id.x()
156  %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
  ; Operation under test: the volatile load. The store to %out is ordinary.
157  %val = load volatile i32, i32 addrspace(1)* %val.gep, align 4
158  store i32 %val, i32 addrspace(1)* %out
159  ret void
160}
161
; Kernel: ordinary i32 load from %in, followed by a volatile store of the
; loaded value to the global pointer %out.
; What the autogenerated checks below pin down: the store instruction itself
; carries no extra modifier, but it is followed directly by a wait before
; s_endpgm: "s_waitcnt vmcnt(0)" under the GFX6, GFX7 and SKIP-CACHE-INV
; prefixes, and "s_waitcnt_vscnt null, 0x0" under both GFX10 prefixes.
; The non-volatile load of %in is expected as a scalar s_load_dword.
162define amdgpu_kernel void @global_volatile_store_0(
163; GFX6-LABEL: global_volatile_store_0:
164; GFX6:       ; %bb.0: ; %entry
165; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
166; GFX6-NEXT:    s_mov_b32 s7, 0xf000
167; GFX6-NEXT:    s_mov_b32 s6, -1
168; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
169; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
170; GFX6-NEXT:    s_mov_b32 s4, s2
171; GFX6-NEXT:    s_mov_b32 s5, s3
172; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
173; GFX6-NEXT:    v_mov_b32_e32 v0, s0
174; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
175; GFX6-NEXT:    s_waitcnt vmcnt(0)
176; GFX6-NEXT:    s_endpgm
177;
178; GFX7-LABEL: global_volatile_store_0:
179; GFX7:       ; %bb.0: ; %entry
180; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
181; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
183; GFX7-NEXT:    v_mov_b32_e32 v0, s2
184; GFX7-NEXT:    v_mov_b32_e32 v1, s3
185; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX7-NEXT:    v_mov_b32_e32 v2, s0
187; GFX7-NEXT:    flat_store_dword v[0:1], v2
188; GFX7-NEXT:    s_waitcnt vmcnt(0)
189; GFX7-NEXT:    s_endpgm
190;
191; GFX10-WGP-LABEL: global_volatile_store_0:
192; GFX10-WGP:       ; %bb.0: ; %entry
193; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
194; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
195; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
197; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
199; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
200; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
201; GFX10-WGP-NEXT:    s_endpgm
202;
203; GFX10-CU-LABEL: global_volatile_store_0:
204; GFX10-CU:       ; %bb.0: ; %entry
205; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
206; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
207; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
209; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
211; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
212; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
213; GFX10-CU-NEXT:    s_endpgm
214;
215; SKIP-CACHE-INV-LABEL: global_volatile_store_0:
216; SKIP-CACHE-INV:       ; %bb.0: ; %entry
217; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
218; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
219; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
220; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
221; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
222; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
223; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
224; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
225; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
226; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
227; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
228; SKIP-CACHE-INV-NEXT:    s_endpgm
229    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
230entry:
  ; The load only supplies a value; the volatile store is the operation under
  ; test.
231  %val = load i32, i32 addrspace(1)* %in, align 4
232  store volatile i32 %val, i32 addrspace(1)* %out
233  ret void
234}
235
; Kernel: ordinary i32 load from %in, followed by a volatile store of the
; loaded value to %out indexed by the x work-item id.
; Same shape as a plain volatile store, but the address is per-lane, so the
; checks below expect a vector-addressed store (addr64 buffer store, flat
; store with a computed 64-bit address, or global store with a VGPR offset).
; The pinned expectation is the wait placed directly after the store:
; "s_waitcnt vmcnt(0)" under the GFX6, GFX7 and SKIP-CACHE-INV prefixes, and
; "s_waitcnt_vscnt null, 0x0" under both GFX10 prefixes.
236define amdgpu_kernel void @global_volatile_store_1(
237; GFX6-LABEL: global_volatile_store_1:
238; GFX6:       ; %bb.0: ; %entry
239; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
240; GFX6-NEXT:    s_mov_b32 s7, 0xf000
241; GFX6-NEXT:    s_mov_b32 s6, 0
242; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
243; GFX6-NEXT:    v_mov_b32_e32 v1, 0
244; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
245; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
246; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
247; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX6-NEXT:    v_mov_b32_e32 v2, s0
249; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
250; GFX6-NEXT:    s_waitcnt vmcnt(0)
251; GFX6-NEXT:    s_endpgm
252;
253; GFX7-LABEL: global_volatile_store_1:
254; GFX7:       ; %bb.0: ; %entry
255; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
256; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
257; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
259; GFX7-NEXT:    v_mov_b32_e32 v1, s3
260; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
261; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
262; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
263; GFX7-NEXT:    v_mov_b32_e32 v2, s0
264; GFX7-NEXT:    flat_store_dword v[0:1], v2
265; GFX7-NEXT:    s_waitcnt vmcnt(0)
266; GFX7-NEXT:    s_endpgm
267;
268; GFX10-WGP-LABEL: global_volatile_store_1:
269; GFX10-WGP:       ; %bb.0: ; %entry
270; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
271; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
272; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
274; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
276; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
277; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
278; GFX10-WGP-NEXT:    s_endpgm
279;
280; GFX10-CU-LABEL: global_volatile_store_1:
281; GFX10-CU:       ; %bb.0: ; %entry
282; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
283; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
284; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
285; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
286; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
287; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
288; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
289; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX10-CU-NEXT:    s_endpgm
291;
292; SKIP-CACHE-INV-LABEL: global_volatile_store_1:
293; SKIP-CACHE-INV:       ; %bb.0: ; %entry
294; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
295; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
296; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
297; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
298; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
299; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
300; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
301; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
302; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
303; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
304; SKIP-CACHE-INV-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
305; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
306; SKIP-CACHE-INV-NEXT:    s_endpgm
307    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
308entry:
  ; %in is read at a fixed address; only the destination is indexed by the
  ; work-item id, giving the volatile store a per-lane address.
309  %tid = call i32 @llvm.amdgcn.workitem.id.x()
310  %val = load i32, i32 addrspace(1)* %in, align 4
311  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  ; Operation under test: the volatile store.
312  store volatile i32 %val, i32 addrspace(1)* %out.gep
313  ret void
314}
315
; Kernel: atomic volatile i32 load from %in with syncscope("workgroup") and
; acquire ordering, followed by an ordinary store of the loaded value to %out.
; What the autogenerated checks below pin down:
;  - under the GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV prefixes the load has
;    no cache modifier and no invalidate instruction is expected; the only
;    wait is the "s_waitcnt vmcnt(0)" placed just before the dependent store;
;  - under the GFX10-WGP prefix the load carries "glc" and is followed by
;    "s_waitcnt vmcnt(0)" and then "buffer_gl0_inv" before the store.
; No configuration expects the "glc dlc" modifier pair on this load, even
; though the load is marked volatile.
316define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
317; GFX6-LABEL: global_volatile_workgroup_acquire_load:
318; GFX6:       ; %bb.0: ; %entry
319; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
320; GFX6-NEXT:    s_mov_b32 s3, 0xf000
321; GFX6-NEXT:    s_mov_b32 s2, -1
322; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX6-NEXT:    s_mov_b32 s0, s4
324; GFX6-NEXT:    s_mov_b32 s1, s5
325; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
326; GFX6-NEXT:    s_mov_b32 s4, s6
327; GFX6-NEXT:    s_mov_b32 s5, s7
328; GFX6-NEXT:    s_mov_b32 s6, s2
329; GFX6-NEXT:    s_mov_b32 s7, s3
330; GFX6-NEXT:    s_waitcnt vmcnt(0)
331; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
332; GFX6-NEXT:    s_endpgm
333;
334; GFX7-LABEL: global_volatile_workgroup_acquire_load:
335; GFX7:       ; %bb.0: ; %entry
336; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
337; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX7-NEXT:    v_mov_b32_e32 v0, s0
339; GFX7-NEXT:    v_mov_b32_e32 v1, s1
340; GFX7-NEXT:    flat_load_dword v0, v[0:1]
341; GFX7-NEXT:    v_mov_b32_e32 v2, s2
342; GFX7-NEXT:    v_mov_b32_e32 v3, s3
343; GFX7-NEXT:    s_waitcnt vmcnt(0)
344; GFX7-NEXT:    flat_store_dword v[2:3], v0
345; GFX7-NEXT:    s_endpgm
346;
347; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load:
348; GFX10-WGP:       ; %bb.0: ; %entry
349; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
350; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
351; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
353; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
354; GFX10-WGP-NEXT:    buffer_gl0_inv
355; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
356; GFX10-WGP-NEXT:    s_endpgm
357;
358; GFX10-CU-LABEL: global_volatile_workgroup_acquire_load:
359; GFX10-CU:       ; %bb.0: ; %entry
360; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
361; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
362; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
363; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
364; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
365; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
366; GFX10-CU-NEXT:    s_endpgm
367;
368; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load:
369; SKIP-CACHE-INV:       ; %bb.0: ; %entry
370; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
371; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
372; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
373; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
374; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
375; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
376; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
377; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
378; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
379; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
380; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
381; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
382; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
383; SKIP-CACHE-INV-NEXT:    s_endpgm
384    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
385entry:
  ; Operation under test: a load that is both atomic (workgroup-scope acquire)
  ; and volatile. The store of the result is ordinary.
386  %val = load atomic volatile i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
387  store i32 %val, i32 addrspace(1)* %out
388  ret void
389}
390
; Kernel: atomic volatile store of the i32 kernel argument %in to the global
; pointer %out with syncscope("workgroup") and release ordering.
; Unlike the other kernels in this file, %in is passed by value (an i32), so
; there is no preceding memory load of the stored data.
; What the autogenerated checks below pin down:
;  - under the GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV prefixes a
;    "s_waitcnt lgkmcnt(0)" is expected immediately before the store;
;  - under the GFX10-WGP prefix "s_waitcnt vmcnt(0) lgkmcnt(0)" and then
;    "s_waitcnt_vscnt null, 0x0" are expected immediately before the store;
;  - in every configuration the store is followed directly by s_endpgm, with
;    no wait after it.
391define amdgpu_kernel void @global_volatile_workgroup_release_store(
392; GFX6-LABEL: global_volatile_workgroup_release_store:
393; GFX6:       ; %bb.0: ; %entry
394; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
395; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
396; GFX6-NEXT:    s_mov_b32 s3, 0xf000
397; GFX6-NEXT:    s_mov_b32 s2, -1
398; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX6-NEXT:    v_mov_b32_e32 v0, s4
400; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
401; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
402; GFX6-NEXT:    s_endpgm
403;
404; GFX7-LABEL: global_volatile_workgroup_release_store:
405; GFX7:       ; %bb.0: ; %entry
406; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
407; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
408; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX7-NEXT:    v_mov_b32_e32 v2, s2
410; GFX7-NEXT:    v_mov_b32_e32 v0, s0
411; GFX7-NEXT:    v_mov_b32_e32 v1, s1
412; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX7-NEXT:    flat_store_dword v[0:1], v2
414; GFX7-NEXT:    s_endpgm
415;
416; GFX10-WGP-LABEL: global_volatile_workgroup_release_store:
417; GFX10-WGP:       ; %bb.0: ; %entry
418; GFX10-WGP-NEXT:    s_clause 0x1
419; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
420; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
421; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
422; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
424; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
425; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
426; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
427; GFX10-WGP-NEXT:    s_endpgm
428;
429; GFX10-CU-LABEL: global_volatile_workgroup_release_store:
430; GFX10-CU:       ; %bb.0: ; %entry
431; GFX10-CU-NEXT:    s_clause 0x1
432; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
433; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
434; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
435; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
436; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
437; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
438; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
439; GFX10-CU-NEXT:    s_endpgm
440;
441; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_release_store:
442; SKIP-CACHE-INV:       ; %bb.0: ; %entry
443; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
444; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
445; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
446; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
447; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
448; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
449; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
450; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
451; SKIP-CACHE-INV-NEXT:    s_endpgm
452   i32 %in, i32 addrspace(1)* %out) {
453entry:
  ; Operation under test: a store that is both atomic (workgroup-scope
  ; release) and volatile.
454  store atomic volatile i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
455  ret void
456}
457
; Intrinsic called by @global_volatile_load_1 and @global_volatile_store_1 to
; obtain the i32 index used in their getelementptr address computations.
458declare i32 @llvm.amdgcn.workitem.id.x()
459