1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
6; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
7
; Kernel entry point that zero-fills a 64-byte addrspace(5) stack buffer with
; one memset call. In every autogenerated assertion group below, the fill is
; expected to appear as four scratch_store_dwordx4 instructions (offsets 64,
; 48, 32 and 16) followed by s_endpgm.
8define amdgpu_kernel void @zero_init_kernel() {
9; GFX9-LABEL: zero_init_kernel:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
12; GFX9-NEXT:    s_mov_b32 s0, 0
13; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
14; GFX9-NEXT:    s_mov_b32 s1, s0
15; GFX9-NEXT:    s_mov_b32 s2, s0
16; GFX9-NEXT:    s_mov_b32 s3, s0
17; GFX9-NEXT:    v_mov_b32_e32 v0, s0
18; GFX9-NEXT:    v_mov_b32_e32 v1, s1
19; GFX9-NEXT:    v_mov_b32_e32 v2, s2
20; GFX9-NEXT:    v_mov_b32_e32 v3, s3
21; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
22; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
23; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
24; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
25; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
26; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
27; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
28; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
29; GFX9-NEXT:    s_endpgm
30;
31; GFX10-LABEL: zero_init_kernel:
32; GFX10:       ; %bb.0:
33; GFX10-NEXT:    s_add_u32 s0, s0, s3
34; GFX10-NEXT:    s_addc_u32 s1, s1, 0
35; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
36; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
37; GFX10-NEXT:    s_mov_b32 s0, 0
38; GFX10-NEXT:    s_mov_b32 s1, s0
39; GFX10-NEXT:    s_mov_b32 s2, s0
40; GFX10-NEXT:    s_mov_b32 s3, s0
41; GFX10-NEXT:    v_mov_b32_e32 v0, s0
42; GFX10-NEXT:    v_mov_b32_e32 v1, s1
43; GFX10-NEXT:    v_mov_b32_e32 v2, s2
44; GFX10-NEXT:    v_mov_b32_e32 v3, s3
45; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
46; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
47; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
48; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
49; GFX10-NEXT:    s_endpgm
50;
51; GFX9-PAL-LABEL: zero_init_kernel:
52; GFX9-PAL:       ; %bb.0:
53; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
54; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
55; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
56; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
57; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
58; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
60; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
61; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
62; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
63; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
64; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
65; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
66; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
67; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
68; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
69; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
70; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
71; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
72; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
73; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
74; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
75; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
76; GFX9-PAL-NEXT:    s_endpgm
77;
78; GFX1010-PAL-LABEL: zero_init_kernel:
79; GFX1010-PAL:       ; %bb.0:
80; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
81; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
82; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
83; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
85; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
86; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
87; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
88; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
89; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
90; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
91; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
92; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
93; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
94; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
95; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
96; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
97; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
98; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
99; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
100; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
101; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
102; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
103; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
104; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
105; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
106; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
107; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
108; GFX1010-PAL-NEXT:    s_endpgm
109;
110; GFX1030-PAL-LABEL: zero_init_kernel:
111; GFX1030-PAL:       ; %bb.0:
112; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
113; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
114; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
115; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
117; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
118; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
119; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
120; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
121; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
122; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
123; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
124; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
125; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
126; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
127; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
128; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
129; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
130; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
131; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
132; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
133; GFX1030-PAL-NEXT:    s_endpgm
  ; 32 x i16 gives the 64 bytes of addrspace(5) storage that are cleared below.
134  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; memset takes an i8 pointer, so view the array through one.
135  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Write the byte value 0 over all 64 bytes; the trailing i1 false is the
  ; intrinsic's volatile flag.
136  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
137  ret void
138}
139
; Non-kernel counterpart of the 64-byte zero-fill case: an ordinary function
; (no calling-convention keyword) that memsets a 64-byte addrspace(5) buffer
; to zero. The assertions below expect four scratch_store_dwordx4
; instructions that use s32 as their scalar operand (offsets 48, 32, 16 and
; none), followed by a return through s_setpc_b64 s[30:31].
140define void @zero_init_foo() {
141; GFX9-LABEL: zero_init_foo:
142; GFX9:       ; %bb.0:
143; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144; GFX9-NEXT:    s_mov_b32 s0, 0
145; GFX9-NEXT:    s_mov_b32 s1, s0
146; GFX9-NEXT:    s_mov_b32 s2, s0
147; GFX9-NEXT:    s_mov_b32 s3, s0
148; GFX9-NEXT:    v_mov_b32_e32 v0, s0
149; GFX9-NEXT:    v_mov_b32_e32 v1, s1
150; GFX9-NEXT:    v_mov_b32_e32 v2, s2
151; GFX9-NEXT:    v_mov_b32_e32 v3, s3
152; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
153; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
154; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
155; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
156; GFX9-NEXT:    s_waitcnt vmcnt(0)
157; GFX9-NEXT:    s_setpc_b64 s[30:31]
158;
159; GFX10-LABEL: zero_init_foo:
160; GFX10:       ; %bb.0:
161; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
163; GFX10-NEXT:    s_mov_b32 s0, 0
164; GFX10-NEXT:    s_mov_b32 s1, s0
165; GFX10-NEXT:    s_mov_b32 s2, s0
166; GFX10-NEXT:    s_mov_b32 s3, s0
167; GFX10-NEXT:    v_mov_b32_e32 v0, s0
168; GFX10-NEXT:    v_mov_b32_e32 v1, s1
169; GFX10-NEXT:    v_mov_b32_e32 v2, s2
170; GFX10-NEXT:    v_mov_b32_e32 v3, s3
171; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
172; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
173; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
174; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
175; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
176; GFX10-NEXT:    s_setpc_b64 s[30:31]
177;
178; GFX9-PAL-LABEL: zero_init_foo:
179; GFX9-PAL:       ; %bb.0:
180; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
182; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
183; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
184; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
185; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
186; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
187; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
188; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
189; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
190; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
191; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
192; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
193; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
194; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
195;
196; GFX10-PAL-LABEL: zero_init_foo:
197; GFX10-PAL:       ; %bb.0:
198; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
200; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
201; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
202; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
203; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
204; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
205; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
206; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
207; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
208; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
209; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
210; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
211; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
212; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
213; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 32 x i16 gives the 64 bytes of addrspace(5) storage that are cleared below.
214  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; memset takes an i8 pointer, so view the array through one.
215  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Write the byte value 0 over all 64 bytes; the trailing i1 false is the
  ; intrinsic's volatile flag.
216  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
217  ret void
218}
219
; Kernel that indexes a [32 x float] addrspace(5) array with its i32 argument
; %idx: it does a volatile store of 15 to element %idx, then a volatile load
; from element (%idx & 15). In the assertions below both addresses are formed
; with scalar instructions (s_lshl_b32 / s_and_b32 / s_add_i32) and handed to
; scratch_store_dword / scratch_load_dword as the SGPR operand.
220define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
221; GFX9-LABEL: store_load_sindex_kernel:
222; GFX9:       ; %bb.0: ; %bb
223; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
224; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
225; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
226; GFX9-NEXT:    v_mov_b32_e32 v0, 15
227; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
229; GFX9-NEXT:    s_and_b32 s0, s0, 15
230; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
231; GFX9-NEXT:    s_add_i32 s1, s1, 4
232; GFX9-NEXT:    scratch_store_dword off, v0, s1
233; GFX9-NEXT:    s_waitcnt vmcnt(0)
234; GFX9-NEXT:    s_add_i32 s0, s0, 4
235; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
236; GFX9-NEXT:    s_waitcnt vmcnt(0)
237; GFX9-NEXT:    s_endpgm
238;
239; GFX10-LABEL: store_load_sindex_kernel:
240; GFX10:       ; %bb.0: ; %bb
241; GFX10-NEXT:    s_add_u32 s2, s2, s5
242; GFX10-NEXT:    s_addc_u32 s3, s3, 0
243; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
244; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
245; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
246; GFX10-NEXT:    v_mov_b32_e32 v0, 15
247; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX10-NEXT:    s_and_b32 s1, s0, 15
249; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
250; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
251; GFX10-NEXT:    s_add_i32 s0, s0, 4
252; GFX10-NEXT:    s_add_i32 s1, s1, 4
253; GFX10-NEXT:    scratch_store_dword off, v0, s0
254; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
255; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
256; GFX10-NEXT:    s_waitcnt vmcnt(0)
257; GFX10-NEXT:    s_endpgm
258;
259; GFX9-PAL-LABEL: store_load_sindex_kernel:
260; GFX9-PAL:       ; %bb.0: ; %bb
261; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
262; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
263; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
264; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
265; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
266; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
268; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
269; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
270; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
271; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
272; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
273; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
274; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
275; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
276; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
277; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
278; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
279; GFX9-PAL-NEXT:    s_endpgm
280;
281; GFX10-PAL-LABEL: store_load_sindex_kernel:
282; GFX10-PAL:       ; %bb.0: ; %bb
283; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
284; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
285; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
286; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
287; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
288; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
289; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
290; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
291; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
292; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
293; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
294; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
296; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
297; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
298; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
299; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
300; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
301; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
302; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
303; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
304; GFX10-PAL-NEXT:    s_endpgm
305bb:
306  %i = alloca [32 x float], align 4, addrspace(5)
  ; NOTE(review): %i1 is not referenced again in this function.
307  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Address of element %idx; the index is used as given, without masking.
308  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
309  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
310  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; The load index keeps only the low four bits of %idx, i.e. 0..15.
311  %i9 = and i32 %idx, 15
312  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
313  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value is never used; the access is kept only because it is
  ; volatile.
314  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
315  ret void
316}
317
; amdgpu_ps variant of the scalar-index case: %idx is an `inreg` i32
; argument. The body does a volatile store of 15 to element %idx of a
; [32 x float] addrspace(5) array, then a volatile load from element
; (%idx & 15). The assertions below expect both addresses to be computed in
; SGPRs and used as the scalar operand of scratch_store_dword /
; scratch_load_dword.
318define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
319; GFX9-LABEL: store_load_sindex_foo:
320; GFX9:       ; %bb.0: ; %bb
321; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
322; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
323; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
324; GFX9-NEXT:    s_add_i32 s0, s0, 4
325; GFX9-NEXT:    v_mov_b32_e32 v0, 15
326; GFX9-NEXT:    scratch_store_dword off, v0, s0
327; GFX9-NEXT:    s_waitcnt vmcnt(0)
328; GFX9-NEXT:    s_and_b32 s0, s2, 15
329; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
330; GFX9-NEXT:    s_add_i32 s0, s0, 4
331; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
332; GFX9-NEXT:    s_waitcnt vmcnt(0)
333; GFX9-NEXT:    s_endpgm
334;
335; GFX10-LABEL: store_load_sindex_foo:
336; GFX10:       ; %bb.0: ; %bb
337; GFX10-NEXT:    s_add_u32 s0, s0, s3
338; GFX10-NEXT:    s_addc_u32 s1, s1, 0
339; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
340; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
341; GFX10-NEXT:    s_and_b32 s0, s2, 15
342; GFX10-NEXT:    v_mov_b32_e32 v0, 15
343; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
344; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
345; GFX10-NEXT:    s_add_i32 s1, s1, 4
346; GFX10-NEXT:    s_add_i32 s0, s0, 4
347; GFX10-NEXT:    scratch_store_dword off, v0, s1
348; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
349; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
350; GFX10-NEXT:    s_waitcnt vmcnt(0)
351; GFX10-NEXT:    s_endpgm
352;
353; GFX9-PAL-LABEL: store_load_sindex_foo:
354; GFX9-PAL:       ; %bb.0: ; %bb
355; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
356; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
357; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
358; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
359; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
361; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
362; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
363; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
364; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
365; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
366; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
367; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
368; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
369; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
370; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
371; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
372; GFX9-PAL-NEXT:    s_endpgm
373;
374; GFX10-PAL-LABEL: store_load_sindex_foo:
375; GFX10-PAL:       ; %bb.0: ; %bb
376; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
377; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
378; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
379; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
381; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
382; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
383; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
384; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
385; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
386; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
387; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
388; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
389; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
390; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
391; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
392; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
393; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
394; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
395; GFX10-PAL-NEXT:    s_endpgm
396bb:
397  %i = alloca [32 x float], align 4, addrspace(5)
  ; NOTE(review): %i1 is not referenced again in this function.
398  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Address of element %idx; the index is used as given, without masking.
399  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
400  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
401  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; The load index keeps only the low four bits of %idx, i.e. 0..15.
402  %i9 = and i32 %idx, 15
403  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
404  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value is never used; the access is kept only because it is
  ; volatile.
405  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
406  ret void
407}
408
; Kernel whose array index comes from llvm.amdgcn.workitem.id.x rather than
; from an argument. It does a volatile store of 15 to element id of a
; [32 x float] addrspace(5) array, then a volatile load from element
; (31 - id). The assertions below expect VGPR addresses: a v_add result for
; the store, and a v_sub result combined with offset:124 for the load, which
; is consistent with 31 elements * 4 bytes.
409define amdgpu_kernel void @store_load_vindex_kernel() {
410; GFX9-LABEL: store_load_vindex_kernel:
411; GFX9:       ; %bb.0: ; %bb
412; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
413; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
414; GFX9-NEXT:    v_mov_b32_e32 v1, 4
415; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
416; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
417; GFX9-NEXT:    v_mov_b32_e32 v3, 15
418; GFX9-NEXT:    scratch_store_dword v2, v3, off
419; GFX9-NEXT:    s_waitcnt vmcnt(0)
420; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
421; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
422; GFX9-NEXT:    s_waitcnt vmcnt(0)
423; GFX9-NEXT:    s_endpgm
424;
425; GFX10-LABEL: store_load_vindex_kernel:
426; GFX10:       ; %bb.0: ; %bb
427; GFX10-NEXT:    s_add_u32 s0, s0, s3
428; GFX10-NEXT:    s_addc_u32 s1, s1, 0
429; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
430; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
431; GFX10-NEXT:    v_mov_b32_e32 v1, 4
432; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
433; GFX10-NEXT:    v_mov_b32_e32 v3, 15
434; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
435; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
436; GFX10-NEXT:    scratch_store_dword v2, v3, off
437; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
438; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
439; GFX10-NEXT:    s_waitcnt vmcnt(0)
440; GFX10-NEXT:    s_endpgm
441;
442; GFX9-PAL-LABEL: store_load_vindex_kernel:
443; GFX9-PAL:       ; %bb.0: ; %bb
444; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
445; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
446; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
447; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
448; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
449; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
450; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
451; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
453; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
454; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
455; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
456; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
457; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
458; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
459; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
460; GFX9-PAL-NEXT:    s_endpgm
461;
462; GFX10-PAL-LABEL: store_load_vindex_kernel:
463; GFX10-PAL:       ; %bb.0: ; %bb
464; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
465; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
466; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
467; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
469; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
470; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
471; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
472; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
473; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
474; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
475; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
476; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
477; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
478; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
479; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
480; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
481; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
482; GFX10-PAL-NEXT:    s_endpgm
483bb:
484  %i = alloca [32 x float], align 4, addrspace(5)
  ; NOTE(review): %i1 is not referenced again in this function.
485  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; The work-item id supplies the array index for both accesses below.
486  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %i3 is not referenced again in this function.
487  %i3 = zext i32 %i2 to i64
488  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
489  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
490  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; The load index is 31 - id.
491  %i9 = sub nsw i32 31, %i2
492  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
493  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value is never used; the access is kept only because it is
  ; volatile.
494  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
495  ret void
496}
497
; Ordinary (non-kernel) function taking the index as a plain i32 argument.
; It does a volatile store of 15 to element %idx of a [32 x float]
; addrspace(5) array, then a volatile load from element (%idx & 15). The
; assertions below expect each address to be formed in a VGPR with
; v_lshl_add_u32 (index << 2 added to a copy of s32) and the function to
; return through s_setpc_b64 s[30:31].
498define void @store_load_vindex_foo(i32 %idx) {
499; GFX9-LABEL: store_load_vindex_foo:
500; GFX9:       ; %bb.0: ; %bb
501; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502; GFX9-NEXT:    v_mov_b32_e32 v1, s32
503; GFX9-NEXT:    v_mov_b32_e32 v3, 15
504; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
505; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
506; GFX9-NEXT:    scratch_store_dword v2, v3, off
507; GFX9-NEXT:    s_waitcnt vmcnt(0)
508; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
509; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
510; GFX9-NEXT:    s_waitcnt vmcnt(0)
511; GFX9-NEXT:    s_setpc_b64 s[30:31]
512;
513; GFX10-LABEL: store_load_vindex_foo:
514; GFX10:       ; %bb.0: ; %bb
515; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
516; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
517; GFX10-NEXT:    v_mov_b32_e32 v1, 15
518; GFX10-NEXT:    v_mov_b32_e32 v2, s32
519; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
520; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
521; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
522; GFX10-NEXT:    scratch_store_dword v0, v1, off
523; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
524; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
525; GFX10-NEXT:    s_waitcnt vmcnt(0)
526; GFX10-NEXT:    s_setpc_b64 s[30:31]
527;
528; GFX9-PAL-LABEL: store_load_vindex_foo:
529; GFX9-PAL:       ; %bb.0: ; %bb
530; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
532; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
533; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
534; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
535; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
536; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
537; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
538; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
539; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
540; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
541;
542; GFX10-PAL-LABEL: store_load_vindex_foo:
543; GFX10-PAL:       ; %bb.0: ; %bb
544; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
546; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
547; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s32
548; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
549; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
550; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
551; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
552; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
553; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
554; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
555; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
556bb:
557  %i = alloca [32 x float], align 4, addrspace(5)
  ; NOTE(review): %i1 is not referenced again in this function.
558  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Address of element %idx; the index is used as given, without masking.
559  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
560  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
561  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; The load index keeps only the low four bits of %idx, i.e. 0..15.
562  %i9 = and i32 %idx, 15
563  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
564  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value is never used; the access is kept only because it is
  ; volatile.
565  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
566  ret void
567}
568
; Ordinary function that stores through a caller-supplied addrspace(5)
; pointer instead of a local alloca: it writes the float 10.0 to element 1
; of %arg. The assertions below expect the constant to be materialised as
; 0x41200000 and a single scratch_store_dword that uses the incoming v0 as
; the address with offset:4.
569define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
570; GFX9-LABEL: private_ptr_foo:
571; GFX9:       ; %bb.0:
572; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
574; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
575; GFX9-NEXT:    s_waitcnt vmcnt(0)
576; GFX9-NEXT:    s_setpc_b64 s[30:31]
577;
578; GFX10-LABEL: private_ptr_foo:
579; GFX10:       ; %bb.0:
580; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
582; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
583; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
584; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
585; GFX10-NEXT:    s_setpc_b64 s[30:31]
586;
587; GFX9-PAL-LABEL: private_ptr_foo:
588; GFX9-PAL:       ; %bb.0:
589; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
590; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
591; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
592; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
593; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
594;
595; GFX10-PAL-LABEL: private_ptr_foo:
596; GFX10-PAL:       ; %bb.0:
597; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
599; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
600; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
601; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
602; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; Element 1 of a float pointer, i.e. one float past %arg.
603  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  ; Non-volatile store of 10.0.
604  store float 1.000000e+01, float addrspace(5)* %gep, align 4
605  ret void
606}
607
608define amdgpu_kernel void @zero_init_small_offset_kernel() {
609; GFX9-LABEL: zero_init_small_offset_kernel:
610; GFX9:       ; %bb.0:
611; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
612; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
613; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
614; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
615; GFX9-NEXT:    s_waitcnt vmcnt(0)
616; GFX9-NEXT:    s_mov_b32 s0, 0
617; GFX9-NEXT:    s_mov_b32 s1, s0
618; GFX9-NEXT:    s_mov_b32 s2, s0
619; GFX9-NEXT:    s_mov_b32 s3, s0
620; GFX9-NEXT:    v_mov_b32_e32 v0, s0
621; GFX9-NEXT:    v_mov_b32_e32 v1, s1
622; GFX9-NEXT:    v_mov_b32_e32 v2, s2
623; GFX9-NEXT:    v_mov_b32_e32 v3, s3
624; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
625; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
626; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
627; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
628; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
629; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
630; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
631; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
632; GFX9-NEXT:    s_endpgm
633;
634; GFX10-LABEL: zero_init_small_offset_kernel:
635; GFX10:       ; %bb.0:
636; GFX10-NEXT:    s_add_u32 s0, s0, s3
637; GFX10-NEXT:    s_addc_u32 s1, s1, 0
638; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
639; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
640; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
641; GFX10-NEXT:    s_waitcnt vmcnt(0)
642; GFX10-NEXT:    s_mov_b32 s0, 0
643; GFX10-NEXT:    s_mov_b32 s1, s0
644; GFX10-NEXT:    s_mov_b32 s2, s0
645; GFX10-NEXT:    s_mov_b32 s3, s0
646; GFX10-NEXT:    v_mov_b32_e32 v0, s0
647; GFX10-NEXT:    v_mov_b32_e32 v1, s1
648; GFX10-NEXT:    v_mov_b32_e32 v2, s2
649; GFX10-NEXT:    v_mov_b32_e32 v3, s3
650; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
651; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
652; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
653; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
654; GFX10-NEXT:    s_endpgm
655;
656; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
657; GFX9-PAL:       ; %bb.0:
658; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
659; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
660; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
661; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
662; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
663; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
665; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
666; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
667; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
668; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
669; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
670; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
671; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
672; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
673; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
674; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
675; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
676; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
677; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
678; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
679; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
680; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
681; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
682; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
683; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
684; GFX9-PAL-NEXT:    s_endpgm
685;
686; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
687; GFX1010-PAL:       ; %bb.0:
688; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
689; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
690; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
691; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
692; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
693; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
694; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
695; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
696; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
697; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
698; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
699; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
700; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
701; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
702; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
703; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
704; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
705; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
706; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
707; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
708; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
709; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
710; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
711; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
712; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
713; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
714; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
715; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
716; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
717; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
718; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
719; GFX1010-PAL-NEXT:    s_endpgm
720;
721; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
722; GFX1030-PAL:       ; %bb.0:
723; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
724; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
725; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
726; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
728; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
729; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
730; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
731; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
732; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
733; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
734; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
735; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
736; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
737; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
738; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
739; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
740; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
741; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
742; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
743; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
744; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
745; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
746; GFX1030-PAL-NEXT:    s_endpgm
747  %padding = alloca [64 x i32], align 4, addrspace(5)
748  %alloca = alloca [32 x i16], align 2, addrspace(5)
749  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
750  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
751  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
752  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
753  ret void
754}
755
; Non-kernel variant of the "zero init behind a small offset" test. A 256-byte
; padding alloca ([64 x i32]) is declared ahead of the 64-byte buffer
; ([32 x i16]) that is cleared with llvm.memset. In the expected output below
; the clear is four scratch_store_dwordx4 instructions addressed from s32 at
; offsets 256, 272, 288 and 304, i.e. immediately past the padding.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding array. The index is undef, so only the
  ; access itself matters; presumably it exists to keep %padding from being
  ; dropped (assumption, not stated in the test).
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Clear all 64 bytes of %alloca (32 x i16) through an i8 pointer.
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
846
; Kernel whose array index is the kernel argument %idx: a volatile store of 15
; to %i[%idx] followed by a volatile load of %i[%idx & 15]. %i is declared
; after a 256-byte padding alloca. In the expected output both addresses are
; formed in SGPRs (s_lshl_b32 by 2, then s_addk_i32 0x104) and passed as the
; scalar offset of the scratch instructions; 0x104 is 256 bytes past the
; padding load at offset:4.
define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_addk_i32 s1, 0x104
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; presumably it
  ; exists to keep %padding from being dropped (assumption).
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx], accessed as i32.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15]; the result is discarded.
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
984
; amdgpu_ps variant of the scalar-index test: %idx is passed `inreg`, and the
; body is a volatile store of 15 to %i[%idx] followed by a volatile load of
; %i[%idx & 15], with %i declared after a 256-byte padding alloca. The expected
; output forms both addresses with scalar instructions (s_lshl_b32 by 2,
; s_and_b32 with 15, s_addk_i32 0x104) and passes them as the scalar offset of
; the scratch instructions.
define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; presumably it
  ; exists to keep %padding from being dropped (assumption).
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx], accessed as i32.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15]; the result is discarded.
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1114
; Kernel whose array index is the workitem id (llvm.amdgcn.workitem.id.x): a
; volatile store of 15 to %i[id] followed by a volatile load of %i[31 - id].
; %i is declared after a 256-byte padding alloca. The expected output computes
; the addresses in VGPRs: the id is shifted left by 2, added to / subtracted
; from the constant 0x104, and the load carries offset:124 (31 * 4).
define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x104
; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_store_dword v2, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX1010-PAL-NEXT:    scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX1030-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; presumably it
  ; exists to keep %padding from being dropped (assumption).
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 has no users in this function.
  %i3 = zext i32 %i2 to i64
  ; Store 15 to %i[workitem id], accessed as i32.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from the mirrored slot %i[31 - workitem id]; result is discarded.
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1241
; Non-kernel variant of the vector-index test: a volatile store of 15 to
; %i[%idx] followed by a volatile load of %i[%idx & 15], with %i declared after
; a 256-byte padding alloca. In the expected output the array base is formed as
; s32 + 0x100 (s_add_i32 into vcc_hi / vcc_lo), copied to a VGPR, and combined
; with the index from v0 via v_lshl_add_u32 (shift by 2).
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_store_dword v0, v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; presumably it
  ; exists to keep %padding from being dropped (assumption).
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx], accessed as i32.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15]; the result is discarded.
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1327
; zero_init_large_offset_kernel: amdgpu_kernel entry point that zero-fills a
; 64-byte private (addrspace(5)) array which is declared after a
; [4096 x i32] padding alloca. In the expected output below, the four
; scratch_store_dwordx4 instructions use an SGPR base set to 0x4010 plus
; immediate offsets 0/16/32/48, while the padding load uses offset:16.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script rather than editing by hand.
1328define amdgpu_kernel void @zero_init_large_offset_kernel() {
1329; GFX9-LABEL: zero_init_large_offset_kernel:
1330; GFX9:       ; %bb.0:
1331; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1332; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1333; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1334; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
1335; GFX9-NEXT:    s_waitcnt vmcnt(0)
1336; GFX9-NEXT:    s_mov_b32 s0, 0
1337; GFX9-NEXT:    s_mov_b32 s1, s0
1338; GFX9-NEXT:    s_mov_b32 s2, s0
1339; GFX9-NEXT:    s_mov_b32 s3, s0
1340; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1341; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1342; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1343; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1344; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1345; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1346; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1347; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1348; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1349; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1350; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1351; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1352; GFX9-NEXT:    s_endpgm
1353;
1354; GFX10-LABEL: zero_init_large_offset_kernel:
1355; GFX10:       ; %bb.0:
1356; GFX10-NEXT:    s_add_u32 s0, s0, s3
1357; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1358; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1359; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1360; GFX10-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
1361; GFX10-NEXT:    s_waitcnt vmcnt(0)
1362; GFX10-NEXT:    s_mov_b32 s0, 0
1363; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1364; GFX10-NEXT:    s_mov_b32 s1, s0
1365; GFX10-NEXT:    s_mov_b32 s2, s0
1366; GFX10-NEXT:    s_mov_b32 s3, s0
1367; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1368; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1369; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1370; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1371; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1372; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1373; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1374; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1375; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1376; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1377; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1378; GFX10-NEXT:    s_endpgm
1379;
1380; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
1381; GFX9-PAL:       ; %bb.0:
1382; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1383; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1384; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1385; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1386; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1387; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1388; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1389; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1390; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1391; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
1392; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1393; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1394; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1395; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1396; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1397; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1398; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1399; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1400; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1401; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1402; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1403; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1404; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1405; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1406; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1407; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1408; GFX9-PAL-NEXT:    s_endpgm
1409;
1410; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
1411; GFX1010-PAL:       ; %bb.0:
1412; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1413; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1414; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1415; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1417; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1418; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1419; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1420; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1421; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1422; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
1423; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:16 glc dlc
1424; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1425; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
1426; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1427; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
1428; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
1429; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
1430; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
1431; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
1432; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1433; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1434; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1435; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1436; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1437; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1438; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1439; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1440; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1441; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1442; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1443; GFX1010-PAL-NEXT:    s_endpgm
1444;
1445; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
1446; GFX1030-PAL:       ; %bb.0:
1447; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1448; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1449; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1450; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1452; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1453; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1454; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1455; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1456; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
1457; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1458; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
1459; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1460; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
1461; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1462; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
1463; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
1464; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
1465; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
1466; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
1467; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1468; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1469; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1470; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1471; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1472; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1473; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1474; GFX1030-PAL-NEXT:    s_endpgm
; %padding (4096 x i32 = 16384 bytes) is declared before %alloca; its only use
; is the volatile load below, whose element index is undef.
1475  %padding = alloca [4096 x i32], align 4, addrspace(5)
1476  %alloca = alloca [32 x i16], align 2, addrspace(5)
1477  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1478  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
1479  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
; Zero all 64 bytes of %alloca (32 x i16) with a non-volatile memset.
1480  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1481  ret void
1482}
1483
; zero_init_large_offset_foo: same IR body as zero_init_large_offset_kernel,
; but declared as a plain (non-kernel) function. In the expected output the
; padding load is addressed through s32 and each zeroing store uses a base
; computed as s32 + 0x4000 (s_add_i32) with immediate offsets 0/16/32/48; the
; function returns via s_setpc_b64 s[30:31].
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script rather than editing by hand.
1484define void @zero_init_large_offset_foo() {
1485; GFX9-LABEL: zero_init_large_offset_foo:
1486; GFX9:       ; %bb.0:
1487; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1488; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
1489; GFX9-NEXT:    s_waitcnt vmcnt(0)
1490; GFX9-NEXT:    s_mov_b32 s0, 0
1491; GFX9-NEXT:    s_mov_b32 s1, s0
1492; GFX9-NEXT:    s_mov_b32 s2, s0
1493; GFX9-NEXT:    s_mov_b32 s3, s0
1494; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1495; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1496; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1497; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1498; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
1499; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1500; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
1501; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1502; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
1503; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1504; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
1505; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1506; GFX9-NEXT:    s_waitcnt vmcnt(0)
1507; GFX9-NEXT:    s_setpc_b64 s[30:31]
1508;
1509; GFX10-LABEL: zero_init_large_offset_foo:
1510; GFX10:       ; %bb.0:
1511; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1512; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1513; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
1514; GFX10-NEXT:    s_waitcnt vmcnt(0)
1515; GFX10-NEXT:    s_mov_b32 s0, 0
1516; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1517; GFX10-NEXT:    s_mov_b32 s1, s0
1518; GFX10-NEXT:    s_mov_b32 s2, s0
1519; GFX10-NEXT:    s_mov_b32 s3, s0
1520; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1521; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1522; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1523; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1524; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1525; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1526; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1527; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1528; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1529; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1530; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1531; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1532; GFX10-NEXT:    s_setpc_b64 s[30:31]
1533;
1534; GFX9-PAL-LABEL: zero_init_large_offset_foo:
1535; GFX9-PAL:       ; %bb.0:
1536; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1537; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
1538; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1539; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1540; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1541; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1542; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1543; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1544; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1545; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1546; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1547; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
1548; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1549; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
1550; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1551; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
1552; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1553; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
1554; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1555; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1556; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1557;
1558; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
1559; GFX1010-PAL:       ; %bb.0:
1560; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1561; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1562; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
1563; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1564; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
1565; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1566; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
1567; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1568; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
1569; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
1570; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
1571; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
1572; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
1573; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1574; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1575; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1576; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1577; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1578; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1579; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1580; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1581; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1582; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1583; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1584; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
1585;
1586; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
1587; GFX1030-PAL:       ; %bb.0:
1588; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1589; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1590; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
1591; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1592; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
1593; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1594; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
1595; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1596; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
1597; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
1598; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
1599; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
1600; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
1601; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1602; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1603; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1604; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1605; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1606; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
1607; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1608; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1609; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
; %padding (4096 x i32 = 16384 bytes) is declared before %alloca; its only use
; is the volatile load below, whose element index is undef.
1610  %padding = alloca [4096 x i32], align 4, addrspace(5)
1611  %alloca = alloca [32 x i16], align 2, addrspace(5)
1612  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1613  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
1614  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
; Zero all 64 bytes of %alloca (32 x i16) with a non-volatile memset.
1615  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1616  ret void
1617}
1618
; store_load_sindex_large_offset_kernel: amdgpu_kernel taking an i32 index
; argument. It volatile-stores the constant 15 to element %idx of a
; [32 x float] private array and then volatile-loads element (%idx & 15) of
; the same array. The array is declared after a [4096 x i32] padding alloca;
; in the expected output both addresses are formed in SGPRs as
; (index << 2) + 0x4004 via s_lshl_b32 / s_addk_i32.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script rather than editing by hand.
1619define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
1620; GFX9-LABEL: store_load_sindex_large_offset_kernel:
1621; GFX9:       ; %bb.0: ; %bb
1622; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1623; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1624; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1625; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1626; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1627; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1628; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1629; GFX9-NEXT:    s_and_b32 s0, s0, 15
1630; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1631; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1632; GFX9-NEXT:    s_addk_i32 s1, 0x4004
1633; GFX9-NEXT:    scratch_store_dword off, v0, s1
1634; GFX9-NEXT:    s_waitcnt vmcnt(0)
1635; GFX9-NEXT:    s_addk_i32 s0, 0x4004
1636; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1637; GFX9-NEXT:    s_waitcnt vmcnt(0)
1638; GFX9-NEXT:    s_endpgm
1639;
1640; GFX10-LABEL: store_load_sindex_large_offset_kernel:
1641; GFX10:       ; %bb.0: ; %bb
1642; GFX10-NEXT:    s_add_u32 s2, s2, s5
1643; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1644; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1645; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1646; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1647; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1648; GFX10-NEXT:    s_waitcnt vmcnt(0)
1649; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1650; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1651; GFX10-NEXT:    s_and_b32 s1, s0, 15
1652; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1653; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1654; GFX10-NEXT:    s_addk_i32 s0, 0x4004
1655; GFX10-NEXT:    s_addk_i32 s1, 0x4004
1656; GFX10-NEXT:    scratch_store_dword off, v0, s0
1657; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1658; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1659; GFX10-NEXT:    s_waitcnt vmcnt(0)
1660; GFX10-NEXT:    s_endpgm
1661;
1662; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
1663; GFX9-PAL:       ; %bb.0: ; %bb
1664; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1665; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1666; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1667; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1668; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1669; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1670; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1671; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1672; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1673; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1674; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1675; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1676; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1677; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1678; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1679; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
1680; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1681; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1682; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
1683; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1684; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1685; GFX9-PAL-NEXT:    s_endpgm
1686;
1687; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
1688; GFX1010-PAL:       ; %bb.0: ; %bb
1689; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
1690; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
1691; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1692; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1693; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1694; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
1695; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
1696; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1697; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1698; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1699; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1700; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
1701; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1702; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
1703; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1704; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
1705; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1706; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1707; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
1708; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
1709; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
1710; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1711; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1712; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1713; GFX1010-PAL-NEXT:    s_endpgm
1714;
1715; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
1716; GFX1030-PAL:       ; %bb.0: ; %bb
1717; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
1718; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
1719; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1720; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1721; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1722; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
1723; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
1724; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1725; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1726; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1727; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1728; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1729; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
1730; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1731; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
1732; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1733; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1734; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
1735; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
1736; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
1737; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1738; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1739; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1740; GFX1030-PAL-NEXT:    s_endpgm
1741bb:
; %padding (4096 x i32 = 16384 bytes) is declared before %i; its only use is
; the volatile load below, whose element index is undef.
1742  %padding = alloca [4096 x i32], align 4, addrspace(5)
1743  %i = alloca [32 x float], align 4, addrspace(5)
1744  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1745  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
; %i1 is not referenced again in this function.
1746  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Volatile store of 15 to %i[%idx] (through an i32 view of the float slot).
1747  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1748  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1749  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Volatile load from %i[%idx & 15]; the loaded value %i12 is unused.
1750  %i9 = and i32 %idx, 15
1751  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1752  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1753  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1754  ret void
1755}
1756
; store_load_sindex_large_offset_foo: same IR body as
; store_load_sindex_large_offset_kernel, but declared with the amdgpu_ps
; calling convention and taking the index as an `inreg` i32 argument. It
; volatile-stores 15 to element %idx of a [32 x float] private array declared
; after a [4096 x i32] padding alloca, then volatile-loads element
; (%idx & 15). In the expected output both addresses are formed in SGPRs as
; (index << 2) + 0x4004 via s_lshl_b32 / s_addk_i32.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script rather than editing by hand.
1757define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
1758; GFX9-LABEL: store_load_sindex_large_offset_foo:
1759; GFX9:       ; %bb.0: ; %bb
1760; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1761; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1762; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1763; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
1764; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1765; GFX9-NEXT:    s_waitcnt vmcnt(0)
1766; GFX9-NEXT:    s_addk_i32 s0, 0x4004
1767; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1768; GFX9-NEXT:    scratch_store_dword off, v0, s0
1769; GFX9-NEXT:    s_waitcnt vmcnt(0)
1770; GFX9-NEXT:    s_and_b32 s0, s2, 15
1771; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1772; GFX9-NEXT:    s_addk_i32 s0, 0x4004
1773; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1774; GFX9-NEXT:    s_waitcnt vmcnt(0)
1775; GFX9-NEXT:    s_endpgm
1776;
1777; GFX10-LABEL: store_load_sindex_large_offset_foo:
1778; GFX10:       ; %bb.0: ; %bb
1779; GFX10-NEXT:    s_add_u32 s0, s0, s3
1780; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1781; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1782; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1783; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1784; GFX10-NEXT:    s_waitcnt vmcnt(0)
1785; GFX10-NEXT:    s_and_b32 s0, s2, 15
1786; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1787; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
1788; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1789; GFX10-NEXT:    s_addk_i32 s1, 0x4004
1790; GFX10-NEXT:    s_addk_i32 s0, 0x4004
1791; GFX10-NEXT:    scratch_store_dword off, v0, s1
1792; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1793; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
1794; GFX10-NEXT:    s_waitcnt vmcnt(0)
1795; GFX10-NEXT:    s_endpgm
1796;
1797; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
1798; GFX9-PAL:       ; %bb.0: ; %bb
1799; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1800; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1801; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1802; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1803; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1804; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1805; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1806; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1807; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1808; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1809; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1810; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1811; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1812; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
1813; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1814; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1815; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1816; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
1817; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1818; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1819; GFX9-PAL-NEXT:    s_endpgm
1820;
1821; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
1822; GFX1010-PAL:       ; %bb.0: ; %bb
1823; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1824; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1825; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1826; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1827; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1828; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1829; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1830; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1831; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1832; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1833; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
1834; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
1835; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1836; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
1837; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1838; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1839; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
1840; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
1841; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
1842; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1843; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1844; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1845; GFX1010-PAL-NEXT:    s_endpgm
1846;
1847; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
1848; GFX1030-PAL:       ; %bb.0: ; %bb
1849; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1850; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1851; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1852; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1853; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1854; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1855; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1856; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1857; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1858; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1859; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1860; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
1861; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
1862; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1863; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1864; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
1865; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
1866; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
1867; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1868; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1869; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1870; GFX1030-PAL-NEXT:    s_endpgm
1871bb:
; %padding (4096 x i32 = 16384 bytes) is declared before %i; its only use is
; the volatile load below, whose element index is undef.
1872  %padding = alloca [4096 x i32], align 4, addrspace(5)
1873  %i = alloca [32 x float], align 4, addrspace(5)
1874  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1875  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
; %i1 is not referenced again in this function.
1876  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Volatile store of 15 to %i[%idx] (through an i32 view of the float slot).
1877  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1878  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1879  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Volatile load from %i[%idx & 15]; the loaded value %i12 is unused.
1880  %i9 = and i32 %idx, 15
1881  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1882  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1883  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1884  ret void
1885}
1886
; store_load_vindex_large_offset_kernel: amdgpu_kernel that indexes a
; [32 x float] private array with the result of llvm.amdgcn.workitem.id.x.
; It volatile-stores 15 to element %i2 and then volatile-loads element
; (31 - %i2). The array is declared after a [4096 x i32] padding alloca. In
; the expected output the constant 0x4004 is placed in a VGPR; the store
; address is 0x4004 + (v0 << 2), and the load address is 0x4004 - (v0 << 2)
; with an immediate offset:124.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script rather than editing by hand.
1887define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
1888; GFX9-LABEL: store_load_vindex_large_offset_kernel:
1889; GFX9:       ; %bb.0: ; %bb
1890; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1891; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1892; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1893; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1894; GFX9-NEXT:    s_waitcnt vmcnt(0)
1895; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1896; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4004
1897; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
1898; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1899; GFX9-NEXT:    scratch_store_dword v2, v3, off
1900; GFX9-NEXT:    s_waitcnt vmcnt(0)
1901; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
1902; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1903; GFX9-NEXT:    s_waitcnt vmcnt(0)
1904; GFX9-NEXT:    s_endpgm
1905;
1906; GFX10-LABEL: store_load_vindex_large_offset_kernel:
1907; GFX10:       ; %bb.0: ; %bb
1908; GFX10-NEXT:    s_add_u32 s0, s0, s3
1909; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1910; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1911; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1912; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
1913; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1914; GFX10-NEXT:    v_mov_b32_e32 v3, 15
1915; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1916; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1917; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
1918; GFX10-NEXT:    s_waitcnt vmcnt(0)
1919; GFX10-NEXT:    scratch_store_dword v2, v3, off
1920; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1921; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1922; GFX10-NEXT:    s_waitcnt vmcnt(0)
1923; GFX10-NEXT:    s_endpgm
1924;
1925; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
1926; GFX9-PAL:       ; %bb.0: ; %bb
1927; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1928; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1929; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1930; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1931; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1932; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1933; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1934; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1935; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1936; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1937; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1938; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1939; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1940; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
1941; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1942; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1943; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
1944; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1945; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1946; GFX9-PAL-NEXT:    s_endpgm
1947;
1948; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
1949; GFX1010-PAL:       ; %bb.0: ; %bb
1950; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1951; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1952; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1953; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1954; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1955; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1956; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1957; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1958; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1959; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1960; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1961; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, 15
1962; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1963; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1964; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1965; GFX1010-PAL-NEXT:    scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
1966; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1967; GFX1010-PAL-NEXT:    scratch_store_dword v2, v3, off
1968; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1969; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1970; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1971; GFX1010-PAL-NEXT:    s_endpgm
1972;
1973; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
1974; GFX1030-PAL:       ; %bb.0: ; %bb
1975; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1976; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1977; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1978; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1980; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1981; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1982; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1983; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1984; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1985; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1986; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, 15
1987; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1988; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1989; GFX1030-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
1990; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1991; GFX1030-PAL-NEXT:    scratch_store_dword v2, v3, off
1992; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1993; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1994; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1995; GFX1030-PAL-NEXT:    s_endpgm
1996bb:
; %padding (4096 x i32 = 16384 bytes) is declared before %i; its only use is
; the volatile load below, whose element index is undef.
1997  %padding = alloca [4096 x i32], align 4, addrspace(5)
1998  %i = alloca [32 x float], align 4, addrspace(5)
1999  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2000  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
; %i1 is not referenced again in this function.
2001  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; %i2 is the per-workitem index used for both accesses below; %i3 (its zext
; to i64) is not referenced again in this function.
2002  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
2003  %i3 = zext i32 %i2 to i64
; Volatile store of 15 to %i[%i2] (through an i32 view of the float slot).
2004  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
2005  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2006  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Volatile load from %i[31 - %i2]; the loaded value %i12 is unused.
2007  %i9 = sub nsw i32 31, %i2
2008  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2009  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2010  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2011  ret void
2012}
2013
; Non-kernel function: volatile scratch store at index %idx and volatile load
; at index (%idx & 15) of a 32-element array that is allocated after a
; 4096 x i32 (16 KiB) padding alloca. On every tested configuration the checks
; expect the array base to be formed as s32 + 0x4000.
2014define void @store_load_vindex_large_offset_foo(i32 %idx) {
2015; GFX9-LABEL: store_load_vindex_large_offset_foo:
2016; GFX9:       ; %bb.0: ; %bb
2017; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2018; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
2019; GFX9-NEXT:    s_waitcnt vmcnt(0)
2020; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
2021; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
2022; GFX9-NEXT:    v_mov_b32_e32 v3, 15
2023; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
2024; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
2025; GFX9-NEXT:    scratch_store_dword v2, v3, off
2026; GFX9-NEXT:    s_waitcnt vmcnt(0)
2027; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2028; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
2029; GFX9-NEXT:    s_waitcnt vmcnt(0)
2030; GFX9-NEXT:    s_setpc_b64 s[30:31]
2031;
2032; GFX10-LABEL: store_load_vindex_large_offset_foo:
2033; GFX10:       ; %bb.0: ; %bb
2034; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2035; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2036; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2037; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
2038; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
2039; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
2040; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
2041; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
2042; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
2043; GFX10-NEXT:    s_waitcnt vmcnt(0)
2044; GFX10-NEXT:    scratch_store_dword v0, v1, off
2045; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2046; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
2047; GFX10-NEXT:    s_waitcnt vmcnt(0)
2048; GFX10-NEXT:    s_setpc_b64 s[30:31]
2049;
2050; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
2051; GFX9-PAL:       ; %bb.0: ; %bb
2052; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2053; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
2054; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2055; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
2056; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
2057; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
2058; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
2059; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
2060; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
2061; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2062; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2063; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
2064; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2065; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2066;
2067; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
2068; GFX10-PAL:       ; %bb.0: ; %bb
2069; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2070; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2071; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2072; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
2073; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
2074; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
2075; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
2076; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
2077; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
2078; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2079; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
2080; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2081; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
2082; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2083; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2084bb:
2085  %padding = alloca [4096 x i32], align 4, addrspace(5)
2086  %i = alloca [32 x float], align 4, addrspace(5)
  ; The volatile load below uses %padding, so the padding alloca is not dead.
2087  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2088  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 is not referenced by any later instruction in this function.
2089  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx], then load back from %i[%idx & 15].
2090  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2091  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2092  store volatile i32 15, i32 addrspace(5)* %i8, align 4
2093  %i9 = and i32 %idx, 15
2094  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2095  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2096  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2097  ret void
2098}
2099
; Kernel: volatile store of 13 to an undef index, then a volatile store and
; load of element 4000 (byte offset 16000) of a 4096 x i32 scratch array.
; The checks expect the large constant address to be split between a scalar
; register and the instruction's immediate offset: 0x3000 + 4 + 3712 for the
; gfx900 runs and 0x3800 + 4 + 1664 for the gfx1010/gfx1030 runs (both 16004).
; NOTE(review): the different split presumably reflects a smaller
; immediate-offset range on the gfx10 targets - confirm against the ISA docs.
2100define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
2101; GFX9-LABEL: store_load_large_imm_offset_kernel:
2102; GFX9:       ; %bb.0: ; %bb
2103; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2104; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2105; GFX9-NEXT:    s_movk_i32 s0, 0x3000
2106; GFX9-NEXT:    v_mov_b32_e32 v0, 13
2107; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2108; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
2109; GFX9-NEXT:    s_waitcnt vmcnt(0)
2110; GFX9-NEXT:    s_add_i32 s0, s0, 4
2111; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2112; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
2113; GFX9-NEXT:    s_waitcnt vmcnt(0)
2114; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
2115; GFX9-NEXT:    s_waitcnt vmcnt(0)
2116; GFX9-NEXT:    s_endpgm
2117;
2118; GFX10-LABEL: store_load_large_imm_offset_kernel:
2119; GFX10:       ; %bb.0: ; %bb
2120; GFX10-NEXT:    s_add_u32 s0, s0, s3
2121; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2122; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2123; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2124; GFX10-NEXT:    v_mov_b32_e32 v0, 13
2125; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2126; GFX10-NEXT:    s_movk_i32 s0, 0x3800
2127; GFX10-NEXT:    s_add_i32 s0, s0, 4
2128; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
2129; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2130; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
2131; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2132; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
2133; GFX10-NEXT:    s_waitcnt vmcnt(0)
2134; GFX10-NEXT:    s_endpgm
2135;
2136; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
2137; GFX9-PAL:       ; %bb.0: ; %bb
2138; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2139; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2140; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2141; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
2142; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2143; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
2144; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2145; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2146; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2147; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2148; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
2149; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2150; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
2151; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2152; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
2153; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2154; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
2155; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2156; GFX9-PAL-NEXT:    s_endpgm
2157;
2158; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
2159; GFX1010-PAL:       ; %bb.0: ; %bb
2160; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2161; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2162; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2163; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2164; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2165; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2166; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2167; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2168; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2169; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
2170; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
2171; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
2172; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2173; GFX1010-PAL-NEXT:    s_add_i32 s0, s0, 4
2174; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
2175; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2176; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
2177; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2178; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
2179; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2180; GFX1010-PAL-NEXT:    s_endpgm
2181;
2182; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
2183; GFX1030-PAL:       ; %bb.0: ; %bb
2184; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2185; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2186; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2187; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2188; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2189; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2190; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2191; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2192; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2193; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
2194; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
2195; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
2196; GFX1030-PAL-NEXT:    s_add_i32 s0, s0, 4
2197; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
2198; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2199; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
2200; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2201; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
2202; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2203; GFX1030-PAL-NEXT:    s_endpgm
2204bb:
2205  %i = alloca [4096 x i32], align 4, addrspace(5)
2206  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
2207  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; %i7 and %i10 both address element 4000; the value stored through %i7 is
  ; read back through %i10.
2208  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
2209  store volatile i32 15, i32 addrspace(5)* %i7, align 4
2210  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
2211  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
2212  ret void
2213}
2214
; Non-kernel counterpart of the large-immediate-offset test: volatile store of
; 13 to an undef index, then a volatile store and load of element 4000 (byte
; offset 16000) of a 4096 x i32 scratch array. The checks expect the address
; to be formed relative to s32, with the constant split as 0x3000 + 3712 for
; the gfx900 runs and 0x3800 + 1664 for the gfx10 runs (both 16000).
2215define void @store_load_large_imm_offset_foo() {
2216; GFX9-LABEL: store_load_large_imm_offset_foo:
2217; GFX9:       ; %bb.0: ; %bb
2218; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2219; GFX9-NEXT:    s_movk_i32 s0, 0x3000
2220; GFX9-NEXT:    v_mov_b32_e32 v0, 13
2221; GFX9-NEXT:    scratch_store_dword off, v0, s32
2222; GFX9-NEXT:    s_waitcnt vmcnt(0)
2223; GFX9-NEXT:    s_add_i32 s0, s0, s32
2224; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2225; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
2226; GFX9-NEXT:    s_waitcnt vmcnt(0)
2227; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
2228; GFX9-NEXT:    s_waitcnt vmcnt(0)
2229; GFX9-NEXT:    s_setpc_b64 s[30:31]
2230;
2231; GFX10-LABEL: store_load_large_imm_offset_foo:
2232; GFX10:       ; %bb.0: ; %bb
2233; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2234; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2235; GFX10-NEXT:    v_mov_b32_e32 v0, 13
2236; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2237; GFX10-NEXT:    s_movk_i32 s0, 0x3800
2238; GFX10-NEXT:    s_add_i32 s0, s0, s32
2239; GFX10-NEXT:    scratch_store_dword off, v0, s32
2240; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2241; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
2242; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2243; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
2244; GFX10-NEXT:    s_waitcnt vmcnt(0)
2245; GFX10-NEXT:    s_setpc_b64 s[30:31]
2246;
2247; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
2248; GFX9-PAL:       ; %bb.0: ; %bb
2249; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2250; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
2251; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
2252; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32
2253; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2254; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s32
2255; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2256; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
2257; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2258; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
2259; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2260; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2261;
2262; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
2263; GFX10-PAL:       ; %bb.0: ; %bb
2264; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2265; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2266; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
2267; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2268; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
2269; GFX10-PAL-NEXT:    s_add_i32 s0, s0, s32
2270; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32
2271; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2272; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
2273; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2274; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
2275; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2276; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2277bb:
2278  %i = alloca [4096 x i32], align 4, addrspace(5)
2279  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
2280  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; %i7 and %i10 both address element 4000; the value stored through %i7 is
  ; read back through %i10.
2281  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
2282  store volatile i32 15, i32 addrspace(5)* %i7, align 4
2283  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
2284  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
2285  ret void
2286}
2287
; Kernel: volatile store and load of alloca[%sidx + workitem.id.x + 256].
; The checks expect the constant +256 elements to be folded into the scratch
; instructions as offset:1024, while the variable part is computed into v0 as
; ((%sidx + id) << 2) + 4.
; NOTE(review): %alloca has only 32 elements, so index + 256 is in bounds only
; for suitably negative %sidx + id - presumably irrelevant to what is tested.
2288define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
2289; GFX9-LABEL: store_load_vidx_sidx_offset:
2290; GFX9:       ; %bb.0: ; %bb
2291; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
2292; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
2293; GFX9-NEXT:    v_mov_b32_e32 v1, 4
2294; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2295; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2296; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
2297; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2298; GFX9-NEXT:    v_mov_b32_e32 v1, 15
2299; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
2300; GFX9-NEXT:    s_waitcnt vmcnt(0)
2301; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
2302; GFX9-NEXT:    s_waitcnt vmcnt(0)
2303; GFX9-NEXT:    s_endpgm
2304;
2305; GFX10-LABEL: store_load_vidx_sidx_offset:
2306; GFX10:       ; %bb.0: ; %bb
2307; GFX10-NEXT:    s_add_u32 s2, s2, s5
2308; GFX10-NEXT:    s_addc_u32 s3, s3, 0
2309; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2310; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2311; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
2312; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2313; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2314; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
2315; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
2316; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
2317; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2318; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
2319; GFX10-NEXT:    s_waitcnt vmcnt(0)
2320; GFX10-NEXT:    s_endpgm
2321;
2322; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
2323; GFX9-PAL:       ; %bb.0: ; %bb
2324; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
2325; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
2326; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2327; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
2328; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
2329; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2330; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2331; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
2332; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
2333; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2334; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
2335; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
2336; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
2337; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2338; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
2339; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2340; GFX9-PAL-NEXT:    s_endpgm
2341;
2342; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
2343; GFX10-PAL:       ; %bb.0: ; %bb
2344; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
2345; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
2346; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2347; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2348; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2349; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
2350; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
2351; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2352; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2353; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
2354; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2355; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2356; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
2357; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
2358; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
2359; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2360; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
2361; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2362; GFX10-PAL-NEXT:    s_endpgm
2363bb:
2364  %alloca = alloca [32 x i32], align 4, addrspace(5)
2365  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; index = %sidx (kernel argument) + %vidx (work-item id) + 256 (constant)
2366  %add1 = add nsw i32 %sidx, %vidx
2367  %add2 = add nsw i32 %add1, 256
2368  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
2369  store volatile i32 15, i32 addrspace(5)* %gep, align 4
2370  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
2371  ret void
2372}
2373
; Non-kernel function: volatile i64 store of 15 followed by a volatile i64 load
; through the addrspace(5) pointer argument, both with align 8. The checks
; expect one scratch_store_dwordx2 and one scratch_load_dwordx2 on every
; tested configuration.
2374define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
2375; GFX9-LABEL: store_load_i64_aligned:
2376; GFX9:       ; %bb.0: ; %bb
2377; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2378; GFX9-NEXT:    v_mov_b32_e32 v1, 15
2379; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2380; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2381; GFX9-NEXT:    s_waitcnt vmcnt(0)
2382; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
2383; GFX9-NEXT:    s_waitcnt vmcnt(0)
2384; GFX9-NEXT:    s_setpc_b64 s[30:31]
2385;
2386; GFX10-LABEL: store_load_i64_aligned:
2387; GFX10:       ; %bb.0: ; %bb
2388; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2389; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2390; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2391; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2392; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2393; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2394; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
2395; GFX10-NEXT:    s_waitcnt vmcnt(0)
2396; GFX10-NEXT:    s_setpc_b64 s[30:31]
2397;
2398; GFX9-PAL-LABEL: store_load_i64_aligned:
2399; GFX9-PAL:       ; %bb.0: ; %bb
2400; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2401; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
2402; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
2403; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2404; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2405; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
2406; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2407; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2408;
2409; GFX10-PAL-LABEL: store_load_i64_aligned:
2410; GFX10-PAL:       ; %bb.0: ; %bb
2411; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2412; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2413; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2414; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
2415; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2416; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2417; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
2418; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2419; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2420bb:
2421  store volatile i64 15, i64 addrspace(5)* %arg, align 8
2422  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
2423  ret void
2424}
2425
; Non-kernel function: same i64 store/load as the aligned variant, but with
; align 1 on both accesses. The checks expect the accesses to stay a single
; scratch_store_dwordx2 / scratch_load_dwordx2 (not split into smaller pieces)
; on every tested configuration.
2426define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
2427; GFX9-LABEL: store_load_i64_unaligned:
2428; GFX9:       ; %bb.0: ; %bb
2429; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2430; GFX9-NEXT:    v_mov_b32_e32 v1, 15
2431; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2432; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2433; GFX9-NEXT:    s_waitcnt vmcnt(0)
2434; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
2435; GFX9-NEXT:    s_waitcnt vmcnt(0)
2436; GFX9-NEXT:    s_setpc_b64 s[30:31]
2437;
2438; GFX10-LABEL: store_load_i64_unaligned:
2439; GFX10:       ; %bb.0: ; %bb
2440; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2441; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2442; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2443; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2444; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2445; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2446; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
2447; GFX10-NEXT:    s_waitcnt vmcnt(0)
2448; GFX10-NEXT:    s_setpc_b64 s[30:31]
2449;
2450; GFX9-PAL-LABEL: store_load_i64_unaligned:
2451; GFX9-PAL:       ; %bb.0: ; %bb
2452; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2453; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
2454; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
2455; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2456; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2457; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
2458; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2459; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2460;
2461; GFX10-PAL-LABEL: store_load_i64_unaligned:
2462; GFX10-PAL:       ; %bb.0: ; %bb
2463; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2464; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2465; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2466; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
2467; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2468; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2469; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
2470; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2471; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2472bb:
2473  store volatile i64 15, i64 addrspace(5)* %arg, align 1
2474  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
2475  ret void
2476}
2477
; Non-kernel function: volatile <3 x i32> store of <1, 2, 3> followed by a
; volatile <3 x i32> load through the addrspace(5) pointer argument, both with
; align 1. The checks expect a single scratch_store_dwordx3 /
; scratch_load_dwordx3 on every tested configuration.
2478define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
2479; GFX9-LABEL: store_load_v3i32_unaligned:
2480; GFX9:       ; %bb.0: ; %bb
2481; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2482; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2483; GFX9-NEXT:    v_mov_b32_e32 v2, 2
2484; GFX9-NEXT:    v_mov_b32_e32 v3, 3
2485; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2486; GFX9-NEXT:    s_waitcnt vmcnt(0)
2487; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
2488; GFX9-NEXT:    s_waitcnt vmcnt(0)
2489; GFX9-NEXT:    s_setpc_b64 s[30:31]
2490;
2491; GFX10-LABEL: store_load_v3i32_unaligned:
2492; GFX10:       ; %bb.0: ; %bb
2493; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2494; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2495; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2496; GFX10-NEXT:    v_mov_b32_e32 v2, 2
2497; GFX10-NEXT:    v_mov_b32_e32 v3, 3
2498; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2499; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2500; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
2501; GFX10-NEXT:    s_waitcnt vmcnt(0)
2502; GFX10-NEXT:    s_setpc_b64 s[30:31]
2503;
2504; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
2505; GFX9-PAL:       ; %bb.0: ; %bb
2506; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2507; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
2508; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
2509; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
2510; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2511; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2512; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
2513; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2514; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2515;
2516; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
2517; GFX10-PAL:       ; %bb.0: ; %bb
2518; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2519; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2520; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
2521; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
2522; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
2523; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2524; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2525; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
2526; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2527; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2528bb:
2529  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
2530  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
2531  ret void
2532}
2533
; Non-kernel function: volatile <4 x i32> store of <1, 2, 3, 4> followed by a
; volatile <4 x i32> load through the addrspace(5) pointer argument, both with
; align 1. The checks expect a single scratch_store_dwordx4 /
; scratch_load_dwordx4 on every tested configuration.
2534define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
2535; GFX9-LABEL: store_load_v4i32_unaligned:
2536; GFX9:       ; %bb.0: ; %bb
2537; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2538; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2539; GFX9-NEXT:    v_mov_b32_e32 v2, 2
2540; GFX9-NEXT:    v_mov_b32_e32 v3, 3
2541; GFX9-NEXT:    v_mov_b32_e32 v4, 4
2542; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2543; GFX9-NEXT:    s_waitcnt vmcnt(0)
2544; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
2545; GFX9-NEXT:    s_waitcnt vmcnt(0)
2546; GFX9-NEXT:    s_setpc_b64 s[30:31]
2547;
2548; GFX10-LABEL: store_load_v4i32_unaligned:
2549; GFX10:       ; %bb.0: ; %bb
2550; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2551; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2552; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2553; GFX10-NEXT:    v_mov_b32_e32 v2, 2
2554; GFX10-NEXT:    v_mov_b32_e32 v3, 3
2555; GFX10-NEXT:    v_mov_b32_e32 v4, 4
2556; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2557; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2558; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
2559; GFX10-NEXT:    s_waitcnt vmcnt(0)
2560; GFX10-NEXT:    s_setpc_b64 s[30:31]
2561;
2562; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
2563; GFX9-PAL:       ; %bb.0: ; %bb
2564; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2565; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
2566; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
2567; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
2568; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
2569; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2570; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2571; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
2572; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2573; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2574;
2575; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
2576; GFX10-PAL:       ; %bb.0: ; %bb
2577; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2578; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2579; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
2580; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
2581; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
2582; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
2583; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2584; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2585; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
2586; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2587; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2588bb:
2589  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
2590  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
2591  ret void
2592}
2593
; Non-kernel function: volatile i8 store of 1 followed by a volatile i8 load at
; %arg - 1 (a negative constant offset from the pointer argument).
; NOTE(review): despite the "i32" in the name, the accessed type here is i8.
; Expected lowering differs by target: the gfx1030 runs fold the -1 into the
; instruction as offset:-1, while the gfx900 and gfx1010 runs add -1 to the
; address register first and use no immediate offset.
2594define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
2595; GFX9-LABEL: store_load_i32_negative_unaligned:
2596; GFX9:       ; %bb.0: ; %bb
2597; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2598; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
2599; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2600; GFX9-NEXT:    scratch_store_byte v0, v1, off
2601; GFX9-NEXT:    s_waitcnt vmcnt(0)
2602; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
2603; GFX9-NEXT:    s_waitcnt vmcnt(0)
2604; GFX9-NEXT:    s_setpc_b64 s[30:31]
2605;
2606; GFX10-LABEL: store_load_i32_negative_unaligned:
2607; GFX10:       ; %bb.0: ; %bb
2608; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2609; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2610; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2611; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
2612; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2613; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
2614; GFX10-NEXT:    s_waitcnt vmcnt(0)
2615; GFX10-NEXT:    s_setpc_b64 s[30:31]
2616;
2617; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
2618; GFX9-PAL:       ; %bb.0: ; %bb
2619; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2620; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
2621; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
2622; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
2623; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2624; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
2625; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2626; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2627;
2628; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
2629; GFX1010-PAL:       ; %bb.0: ; %bb
2630; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2631; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2632; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
2633; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
2634; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
2635; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2636; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
2637; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2638; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
2639;
2640; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
2641; GFX1030-PAL:       ; %bb.0: ; %bb
2642; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2643; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2644; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
2645; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
2646; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2647; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
2648; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2649; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
2650bb:
  ; %ptr = %arg - 1 byte; the same pointer is used for the store and the load.
2651  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
2652  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
2653  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
2654  ret void
2655}
2656
; Test: volatile i8 store followed by a volatile i8 load through an
; addrspace(5) pointer at a constant byte offset of -4225 from %arg.
; (Despite the "i32" in the function name, the accesses below are i8.)
;
; The autogenerated assertions below record how each RUN configuration splits
; that -4225 between the VGPR address add and the instruction's immediate
; offset field:
;   * gfx900 (both RUN flavors): the whole constant goes into the add
;     (0xffffef7f == -4225) and no immediate offset is used.
;   * gfx1030 (both RUN flavors): add of 0xfffff000 (-4096) plus offset:-129.
;   * gfx1010 (amdpal RUN only): add of 0xffffefff (-4097) plus offset:-128.
; In every case the two parts sum to -4225.
; NOTE(review): the gfx1010 split differs from gfx1030, presumably because of
; a target restriction on unaligned negative scratch offsets - confirm against
; the backend before relying on this explanation.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of this file).
2657define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
2658; GFX9-LABEL: store_load_i32_large_negative_unaligned:
2659; GFX9:       ; %bb.0: ; %bb
2660; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2661; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
2662; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2663; GFX9-NEXT:    scratch_store_byte v0, v1, off
2664; GFX9-NEXT:    s_waitcnt vmcnt(0)
2665; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
2666; GFX9-NEXT:    s_waitcnt vmcnt(0)
2667; GFX9-NEXT:    s_setpc_b64 s[30:31]
2668;
2669; GFX10-LABEL: store_load_i32_large_negative_unaligned:
2670; GFX10:       ; %bb.0: ; %bb
2671; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2672; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2673; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
2674; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2675; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
2676; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2677; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
2678; GFX10-NEXT:    s_waitcnt vmcnt(0)
2679; GFX10-NEXT:    s_setpc_b64 s[30:31]
2680;
2681; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
2682; GFX9-PAL:       ; %bb.0: ; %bb
2683; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2684; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
2685; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
2686; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
2687; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2688; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
2689; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2690; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2691;
2692; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
2693; GFX1010-PAL:       ; %bb.0: ; %bb
2694; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2695; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2696; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
2697; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
2698; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
2699; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2700; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
2701; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2702; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
2703;
2704; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
2705; GFX1030-PAL:       ; %bb.0: ; %bb
2706; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2707; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2708; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
2709; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
2710; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
2711; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2712; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
2713; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2714; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
2715bb:
; Address under test: %arg displaced by -4225 bytes (i8 element type, so the
; GEP index is a byte count).
2716  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
; Byte-aligned volatile store of the constant 1 to that address.
2717  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
; Volatile load from the same address; %load has no users in this function.
2718  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
2719  ret void
2720}
2721
; Declarations of the LLVM intrinsics used by this test file: a memset on an
; addrspace(5) destination with an i64 length, and the AMDGPU workitem-id-x
; query.
; NOTE(review): the call sites are not in this part of the file - presumably
; they are in the earlier test functions; confirm before removing either one.
2722declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
2723declare i32 @llvm.amdgcn.workitem.id.x()
2724