; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

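; Store to and load back from a private array indexed by a uniform (SGPR) kernel argument.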
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

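; Store to and load back from a private array indexed by the workitem ID, i.e. a divergent VGPR index.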
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 4
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    scratch_store_dword v1, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 4
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT:    scratch_store_dword v0, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

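; Same divergent-index access from a non-entry function, where addressing is based on the stack pointer in s32.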
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, s32
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    scratch_store_dword v1, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, s32
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT:    scratch_store_dword v0, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

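; Store through an incoming private pointer argument at a small constant offset.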
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

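; Uniform-index access where a 256-byte padding alloca places the array at a small frame offset.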
define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_addk_i32 s1, 0x104
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

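; Divergent-index access with the same 256-byte padding alloca in front of the array.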
define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x104
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    scratch_store_dword v1, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x104
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT:    scratch_load_dword v2, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_store_dword v0, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

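; Non-entry-function variant of the small-offset divergent-index access.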
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, vcc_hi
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    scratch_store_dword v1, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT:    scratch_load_dword v2, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_store_dword v0, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

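; Uniform-index access where a 16 KiB padding alloca places the array at a large frame offset (0x4004).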
define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_large_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_addk_i32 s1, 0x4004
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_addk_i32 s0, 0x4004
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_addk_i32 s0, 0x4004
; GFX10-NEXT:    s_addk_i32 s1, 0x4004
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

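; Divergent-index access with the same 16 KiB padding alloca in front of the array.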
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4004
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    scratch_store_dword v1, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x4004
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT:    scratch_load_dword v2, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_store_dword v0, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

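; Non-entry-function variant of the large-offset divergent-index access.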
define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, vcc_hi
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    scratch_store_dword v1, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT:    scratch_load_dword v2, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_store_dword v0, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

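; Store and load at a large constant element index (4000) into a 4096-element private array.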
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3e80
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

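; Non-entry-function variant of the large constant-index store and load.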
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
; GFX9-NEXT:    scratch_store_dword off, v0, s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_add_i32 s0, s0, s32
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3e80
; GFX10-NEXT:    s_add_i32 s0, s0, s32
; GFX10-NEXT:    scratch_store_dword off, v0, s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

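; Address formed from a workitem ID plus a uniform kernel argument plus a constant offset of 256 elements.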
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}

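; Naturally aligned i64 store and load through an incoming private pointer.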
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

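; The same i64 access with align 1; the checks show it is still a single dwordx2 store and load.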
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

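; <3 x i32> store and load with align 1, emitted as single dwordx3 accesses.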
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s2, 3
; GFX9-NEXT:    s_mov_b32 s1, 2
; GFX9-NEXT:    s_mov_b32 s0, 1
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s2, 3
; GFX10-NEXT:    s_mov_b32 s1, 2
; GFX10-NEXT:    s_mov_b32 s0, 1
; GFX10-NEXT:    v_mov_b32_e32 v3, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}

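; <4 x i32> store and load with align 1, emitted as single dwordx4 accesses.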
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 4
; GFX9-NEXT:    s_mov_b32 s2, 3
; GFX9-NEXT:    s_mov_b32 s1, 2
; GFX9-NEXT:    s_mov_b32 s0, 1
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s3, 4
; GFX10-NEXT:    s_mov_b32 s2, 3
; GFX10-NEXT:    s_mov_b32 s1, 2
; GFX10-NEXT:    s_mov_b32 s0, 1
; GFX10-NEXT:    v_mov_b32_e32 v4, s3
; GFX10-NEXT:    v_mov_b32_e32 v3, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()