; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; FIXME: The tahiti run line below is disabled (spelled XUN so that lit skips it); the reason is not recorded in this file.
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s

; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with no explicit alignment on the store. The checks for all three targets
; expect the three dwords to be written by a single ds_write_b96.
define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    v_mov_b32_e32 v1, s13
; GFX9-NEXT:    v_mov_b32_e32 v2, s14
; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v3, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s12
; GFX10-NEXT:    v_mov_b32_e32 v1, s13
; GFX10-NEXT:    v_mov_b32_e32 v2, s14
; GFX10-NEXT:    v_mov_b32_e32 v3, s2
; GFX10-NEXT:    ds_write_b96 v3, v[0:2]
; GFX10-NEXT:    s_endpgm
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out
  ret void
}

; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 1. The checks for all three targets expect the
; value to be split into twelve ds_write_b8 stores at byte offsets 0 through 11,
; with the upper bytes of each dword extracted by s_lshr_b32 by 8, 16 and 24.
define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_lshr_b32 s0, s12, 8
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    s_lshr_b32 s1, s12, 16
; GFX9-NEXT:    ds_write_b8 v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s3, s12, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
; GFX9-NEXT:    s_lshr_b32 s0, s13, 8
; GFX9-NEXT:    v_mov_b32_e32 v0, s13
; GFX9-NEXT:    s_lshr_b32 s1, s13, 16
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s2, s13, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
; GFX9-NEXT:    s_lshr_b32 s0, s14, 8
; GFX9-NEXT:    v_mov_b32_e32 v0, s14
; GFX9-NEXT:    s_lshr_b32 s1, s14, 16
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_lshr_b32 s2, s14, 24
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    ds_write_b8 v1, v0 offset:11
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    s_lshr_b32 s3, s0, 8
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
; GFX7-NEXT:    ds_write_b8 v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    s_lshr_b32 s6, s0, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_lshr_b32 s3, s1, 16
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s4, s1, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
; GFX7-NEXT:    s_lshr_b32 s0, s2, 8
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    s_lshr_b32 s1, s2, 16
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:9
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    ds_write_b8 v1, v0 offset:11
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_lshr_b32 s0, s12, 8
; GFX10-NEXT:    v_mov_b32_e32 v0, s12
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    s_lshr_b32 s5, s13, 24
; GFX10-NEXT:    s_lshr_b32 s1, s12, 16
; GFX10-NEXT:    v_mov_b32_e32 v2, s13
; GFX10-NEXT:    s_lshr_b32 s3, s12, 24
; GFX10-NEXT:    s_lshr_b32 s6, s14, 8
; GFX10-NEXT:    v_mov_b32_e32 v4, s0
; GFX10-NEXT:    v_mov_b32_e32 v9, s5
; GFX10-NEXT:    s_lshr_b32 s2, s13, 8
; GFX10-NEXT:    s_lshr_b32 s4, s13, 16
; GFX10-NEXT:    s_lshr_b32 s7, s14, 16
; GFX10-NEXT:    v_mov_b32_e32 v3, s14
; GFX10-NEXT:    v_mov_b32_e32 v5, s1
; GFX10-NEXT:    s_lshr_b32 s8, s14, 24
; GFX10-NEXT:    v_mov_b32_e32 v6, s3
; GFX10-NEXT:    v_mov_b32_e32 v10, s6
; GFX10-NEXT:    v_mov_b32_e32 v7, s2
; GFX10-NEXT:    v_mov_b32_e32 v8, s4
; GFX10-NEXT:    ds_write_b8 v1, v0
; GFX10-NEXT:    ds_write_b8 v1, v2 offset:4
; GFX10-NEXT:    ds_write_b8 v1, v4 offset:1
; GFX10-NEXT:    ds_write_b8 v1, v5 offset:2
; GFX10-NEXT:    ds_write_b8 v1, v6 offset:3
; GFX10-NEXT:    ds_write_b8 v1, v7 offset:5
; GFX10-NEXT:    ds_write_b8 v1, v8 offset:6
; GFX10-NEXT:    v_mov_b32_e32 v0, s7
; GFX10-NEXT:    v_mov_b32_e32 v2, s8
; GFX10-NEXT:    ds_write_b8 v1, v9 offset:7
; GFX10-NEXT:    ds_write_b8 v1, v3 offset:8
; GFX10-NEXT:    ds_write_b8 v1, v10 offset:9
; GFX10-NEXT:    ds_write_b8 v1, v0 offset:10
; GFX10-NEXT:    ds_write_b8 v1, v2 offset:11
; GFX10-NEXT:    s_endpgm
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
  ret void
}

; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 2. The checks for all three targets expect six
; ds_write_b16 stores at byte offsets 0, 2, 4, 6, 8 and 10, with the high half
; of each dword extracted by s_lshr_b32 by 16.
define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_lshr_b32 s0, s12, 16
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    ds_write_b16 v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:2
; GFX9-NEXT:    s_lshr_b32 s0, s13, 16
; GFX9-NEXT:    v_mov_b32_e32 v0, s13
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:6
; GFX9-NEXT:    s_lshr_b32 s0, s14, 16
; GFX9-NEXT:    v_mov_b32_e32 v0, s14
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    ds_write_b16 v1, v0 offset:10
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    s_lshr_b32 s3, s0, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:2
; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:6
; GFX7-NEXT:    s_lshr_b32 s0, s2, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b16 v1, v0 offset:10
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s12
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    s_lshr_b32 s0, s12, 16
; GFX10-NEXT:    v_mov_b32_e32 v2, s13
; GFX10-NEXT:    s_lshr_b32 s1, s13, 16
; GFX10-NEXT:    v_mov_b32_e32 v3, s14
; GFX10-NEXT:    s_lshr_b32 s2, s14, 16
; GFX10-NEXT:    v_mov_b32_e32 v4, s0
; GFX10-NEXT:    v_mov_b32_e32 v5, s1
; GFX10-NEXT:    v_mov_b32_e32 v6, s2
; GFX10-NEXT:    ds_write_b16 v1, v0
; GFX10-NEXT:    ds_write_b16 v1, v2 offset:4
; GFX10-NEXT:    ds_write_b16 v1, v3 offset:8
; GFX10-NEXT:    ds_write_b16 v1, v4 offset:2
; GFX10-NEXT:    ds_write_b16 v1, v5 offset:6
; GFX10-NEXT:    ds_write_b16 v1, v6 offset:10
; GFX10-NEXT:    s_endpgm
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
  ret void
}

; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 4. The checks for all three targets expect the
; first two dwords to be written by one ds_write2_b32 (offset1:1) and the third
; dword by a ds_write_b32 at byte offset 8.
define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    v_mov_b32_e32 v1, s13
; GFX9-NEXT:    v_mov_b32_e32 v3, s14
; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s2
; GFX7-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; GFX7-NEXT:    ds_write_b32 v2, v3 offset:8
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s12
; GFX10-NEXT:    v_mov_b32_e32 v1, s13
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s14
; GFX10-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:8
; GFX10-NEXT:    s_endpgm
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
  ret void
}

; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 8. The checks for all three targets expect the
; first two dwords to be written by one ds_write_b64 and the third dword by a
; ds_write_b32 at byte offset 8.
define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    v_mov_b32_e32 v1, s13
; GFX9-NEXT:    v_mov_b32_e32 v3, s14
; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s2
; GFX7-NEXT:    ds_write_b64 v2, v[0:1]
; GFX7-NEXT:    ds_write_b32 v2, v3 offset:8
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s12
; GFX10-NEXT:    v_mov_b32_e32 v1, s13
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s14
; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:8
; GFX10-NEXT:    s_endpgm
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
  ret void
}

; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked align 16. The checks for all three targets expect the
; three dwords to be written by a single ds_write_b96.
define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    v_mov_b32_e32 v1, s13
; GFX9-NEXT:    v_mov_b32_e32 v2, s14
; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v3, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s12
; GFX10-NEXT:    v_mov_b32_e32 v1, s13
; GFX10-NEXT:    v_mov_b32_e32 v2, s14
; GFX10-NEXT:    v_mov_b32_e32 v3, s2
; GFX10-NEXT:    ds_write_b96 v3, v[0:2]
; GFX10-NEXT:    s_endpgm
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
  ret void
}
