1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; with no explicit alignment on the store instruction.
; Expected lowering, per the RUN lines at the top of the file
;   gfx900, hawaii, gfx1010 - one ds_write_b128
;   tahiti                  - one ds_write2_b64 with offset1 set to 1
; These expectations are identical to the explicit "align 16" variant in this
; file (presumably the natural alignment of the vector type is used - confirm).
7define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
8; GFX9-LABEL: store_lds_v4i32:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
11; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
12; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX9-NEXT:    v_mov_b32_e32 v4, s2
14; GFX9-NEXT:    v_mov_b32_e32 v0, s4
15; GFX9-NEXT:    v_mov_b32_e32 v1, s5
16; GFX9-NEXT:    v_mov_b32_e32 v2, s6
17; GFX9-NEXT:    v_mov_b32_e32 v3, s7
18; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
19; GFX9-NEXT:    s_endpgm
20;
21; GFX7-LABEL: store_lds_v4i32:
22; GFX7:       ; %bb.0:
23; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
24; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
25; GFX7-NEXT:    s_mov_b32 m0, -1
26; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX7-NEXT:    v_mov_b32_e32 v4, s4
28; GFX7-NEXT:    v_mov_b32_e32 v0, s0
29; GFX7-NEXT:    v_mov_b32_e32 v1, s1
30; GFX7-NEXT:    v_mov_b32_e32 v2, s2
31; GFX7-NEXT:    v_mov_b32_e32 v3, s3
32; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
33; GFX7-NEXT:    s_endpgm
34;
35; GFX6-LABEL: store_lds_v4i32:
36; GFX6:       ; %bb.0:
37; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
38; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
39; GFX6-NEXT:    s_mov_b32 m0, -1
40; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX6-NEXT:    v_mov_b32_e32 v4, s4
42; GFX6-NEXT:    v_mov_b32_e32 v0, s2
43; GFX6-NEXT:    v_mov_b32_e32 v1, s3
44; GFX6-NEXT:    v_mov_b32_e32 v2, s0
45; GFX6-NEXT:    v_mov_b32_e32 v3, s1
46; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
47; GFX6-NEXT:    s_endpgm
48;
49; GFX10-LABEL: store_lds_v4i32:
50; GFX10:       ; %bb.0:
51; GFX10-NEXT:    s_clause 0x1
52; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
53; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
54; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX10-NEXT:    v_mov_b32_e32 v4, s2
56; GFX10-NEXT:    v_mov_b32_e32 v0, s4
57; GFX10-NEXT:    v_mov_b32_e32 v1, s5
58; GFX10-NEXT:    v_mov_b32_e32 v2, s6
59; GFX10-NEXT:    v_mov_b32_e32 v3, s7
60; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
61; GFX10-NEXT:    s_endpgm
62  store <4 x i32> %x, <4 x i32> addrspace(3)* %out
63  ret void
64}
65
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 1", so the expected code writes the 16 bytes one at a time.
; Expected lowering, per the RUN lines at the top of the file
;   gfx900, gfx1010 - byte 0 of each dword via ds_write_b8 and byte 2 via
;                     ds_write_b8_d16_hi from the same VGPR; bytes 1 and 3 are
;                     produced with s_lshr_b32 by 8 and 24 before ds_write_b8
;   hawaii, tahiti  - sixteen ds_write_b8, with s_lshr_b32 by 8, 16 and 24 for
;                     the three upper bytes of every dword
7define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
67; GFX9-LABEL: store_lds_v4i32_align1:
68; GFX9:       ; %bb.0:
69; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
70; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
71; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX9-NEXT:    v_mov_b32_e32 v0, s2
73; GFX9-NEXT:    v_mov_b32_e32 v1, s7
74; GFX9-NEXT:    v_mov_b32_e32 v2, s6
75; GFX9-NEXT:    ds_write_b8 v0, v1 offset:12
76; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
77; GFX9-NEXT:    ds_write_b8 v0, v2 offset:8
78; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
79; GFX9-NEXT:    v_mov_b32_e32 v1, s5
80; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
81; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
82; GFX9-NEXT:    v_mov_b32_e32 v1, s4
83; GFX9-NEXT:    s_lshr_b32 s0, s7, 8
84; GFX9-NEXT:    ds_write_b8 v0, v1
85; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
86; GFX9-NEXT:    v_mov_b32_e32 v1, s0
87; GFX9-NEXT:    s_lshr_b32 s0, s7, 24
88; GFX9-NEXT:    ds_write_b8 v0, v1 offset:13
89; GFX9-NEXT:    v_mov_b32_e32 v1, s0
90; GFX9-NEXT:    s_lshr_b32 s0, s6, 8
91; GFX9-NEXT:    ds_write_b8 v0, v1 offset:15
92; GFX9-NEXT:    v_mov_b32_e32 v1, s0
93; GFX9-NEXT:    s_lshr_b32 s0, s6, 24
94; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
95; GFX9-NEXT:    v_mov_b32_e32 v1, s0
96; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
97; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
98; GFX9-NEXT:    v_mov_b32_e32 v1, s0
99; GFX9-NEXT:    s_lshr_b32 s0, s5, 24
100; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
101; GFX9-NEXT:    v_mov_b32_e32 v1, s0
102; GFX9-NEXT:    s_lshr_b32 s0, s4, 8
103; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
104; GFX9-NEXT:    v_mov_b32_e32 v1, s0
105; GFX9-NEXT:    s_lshr_b32 s0, s4, 24
106; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
107; GFX9-NEXT:    v_mov_b32_e32 v1, s0
108; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
109; GFX9-NEXT:    s_endpgm
110;
111; GFX7-LABEL: store_lds_v4i32_align1:
112; GFX7:       ; %bb.0:
113; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
114; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
115; GFX7-NEXT:    s_mov_b32 m0, -1
116; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX7-NEXT:    v_mov_b32_e32 v0, s4
118; GFX7-NEXT:    v_mov_b32_e32 v1, s3
119; GFX7-NEXT:    v_mov_b32_e32 v2, s2
120; GFX7-NEXT:    ds_write_b8 v0, v1 offset:12
121; GFX7-NEXT:    ds_write_b8 v0, v2 offset:8
122; GFX7-NEXT:    v_mov_b32_e32 v1, s1
123; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
124; GFX7-NEXT:    v_mov_b32_e32 v1, s0
125; GFX7-NEXT:    s_lshr_b32 s4, s3, 8
126; GFX7-NEXT:    ds_write_b8 v0, v1
127; GFX7-NEXT:    v_mov_b32_e32 v1, s4
128; GFX7-NEXT:    s_lshr_b32 s4, s3, 24
129; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
130; GFX7-NEXT:    v_mov_b32_e32 v1, s4
131; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
132; GFX7-NEXT:    ds_write_b8 v0, v1 offset:15
133; GFX7-NEXT:    v_mov_b32_e32 v1, s3
134; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
135; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
136; GFX7-NEXT:    v_mov_b32_e32 v1, s3
137; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
138; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
139; GFX7-NEXT:    v_mov_b32_e32 v1, s3
140; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
141; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
142; GFX7-NEXT:    v_mov_b32_e32 v1, s2
143; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
144; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
145; GFX7-NEXT:    v_mov_b32_e32 v1, s2
146; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
147; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
148; GFX7-NEXT:    v_mov_b32_e32 v1, s2
149; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
150; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
151; GFX7-NEXT:    v_mov_b32_e32 v1, s1
152; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
153; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
154; GFX7-NEXT:    v_mov_b32_e32 v1, s1
155; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
156; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
157; GFX7-NEXT:    v_mov_b32_e32 v1, s1
158; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
159; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
160; GFX7-NEXT:    v_mov_b32_e32 v1, s0
161; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
162; GFX7-NEXT:    s_endpgm
163;
164; GFX6-LABEL: store_lds_v4i32_align1:
165; GFX6:       ; %bb.0:
166; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
167; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
168; GFX6-NEXT:    s_mov_b32 m0, -1
169; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX6-NEXT:    v_mov_b32_e32 v0, s4
171; GFX6-NEXT:    v_mov_b32_e32 v1, s3
172; GFX6-NEXT:    v_mov_b32_e32 v2, s2
173; GFX6-NEXT:    ds_write_b8 v0, v1 offset:12
174; GFX6-NEXT:    ds_write_b8 v0, v2 offset:8
175; GFX6-NEXT:    v_mov_b32_e32 v1, s1
176; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
177; GFX6-NEXT:    v_mov_b32_e32 v1, s0
178; GFX6-NEXT:    s_lshr_b32 s4, s3, 8
179; GFX6-NEXT:    ds_write_b8 v0, v1
180; GFX6-NEXT:    v_mov_b32_e32 v1, s4
181; GFX6-NEXT:    s_lshr_b32 s4, s3, 24
182; GFX6-NEXT:    ds_write_b8 v0, v1 offset:13
183; GFX6-NEXT:    v_mov_b32_e32 v1, s4
184; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
185; GFX6-NEXT:    ds_write_b8 v0, v1 offset:15
186; GFX6-NEXT:    v_mov_b32_e32 v1, s3
187; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
188; GFX6-NEXT:    ds_write_b8 v0, v1 offset:14
189; GFX6-NEXT:    v_mov_b32_e32 v1, s3
190; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
191; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
192; GFX6-NEXT:    v_mov_b32_e32 v1, s3
193; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
194; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
195; GFX6-NEXT:    v_mov_b32_e32 v1, s2
196; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
197; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
198; GFX6-NEXT:    v_mov_b32_e32 v1, s2
199; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
200; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
201; GFX6-NEXT:    v_mov_b32_e32 v1, s2
202; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
203; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
204; GFX6-NEXT:    v_mov_b32_e32 v1, s1
205; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
206; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
207; GFX6-NEXT:    v_mov_b32_e32 v1, s1
208; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
209; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
210; GFX6-NEXT:    v_mov_b32_e32 v1, s1
211; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
212; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
213; GFX6-NEXT:    v_mov_b32_e32 v1, s0
214; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
215; GFX6-NEXT:    s_endpgm
216;
217; GFX10-LABEL: store_lds_v4i32_align1:
218; GFX10:       ; %bb.0:
219; GFX10-NEXT:    s_clause 0x1
220; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
221; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
222; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX10-NEXT:    v_mov_b32_e32 v0, s2
224; GFX10-NEXT:    v_mov_b32_e32 v1, s7
225; GFX10-NEXT:    s_lshr_b32 s3, s6, 24
226; GFX10-NEXT:    s_lshr_b32 s0, s7, 8
227; GFX10-NEXT:    s_lshr_b32 s2, s6, 8
228; GFX10-NEXT:    v_mov_b32_e32 v2, s6
229; GFX10-NEXT:    s_lshr_b32 s6, s5, 8
230; GFX10-NEXT:    v_mov_b32_e32 v3, s5
231; GFX10-NEXT:    s_lshr_b32 s1, s7, 24
232; GFX10-NEXT:    s_lshr_b32 s5, s5, 24
233; GFX10-NEXT:    v_mov_b32_e32 v8, s3
234; GFX10-NEXT:    v_mov_b32_e32 v5, s0
235; GFX10-NEXT:    v_mov_b32_e32 v9, s6
236; GFX10-NEXT:    s_lshr_b32 s0, s4, 8
237; GFX10-NEXT:    v_mov_b32_e32 v6, s1
238; GFX10-NEXT:    v_mov_b32_e32 v4, s4
239; GFX10-NEXT:    v_mov_b32_e32 v7, s2
240; GFX10-NEXT:    ds_write_b8 v0, v1 offset:12
241; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
242; GFX10-NEXT:    ds_write_b8 v0, v2 offset:8
243; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
244; GFX10-NEXT:    ds_write_b8 v0, v3 offset:4
245; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:6
246; GFX10-NEXT:    ds_write_b8 v0, v4
247; GFX10-NEXT:    ds_write_b8_d16_hi v0, v4 offset:2
248; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
249; GFX10-NEXT:    ds_write_b8 v0, v6 offset:15
250; GFX10-NEXT:    ds_write_b8 v0, v7 offset:9
251; GFX10-NEXT:    s_lshr_b32 s1, s4, 24
252; GFX10-NEXT:    v_mov_b32_e32 v1, s5
253; GFX10-NEXT:    v_mov_b32_e32 v2, s0
254; GFX10-NEXT:    v_mov_b32_e32 v3, s1
255; GFX10-NEXT:    ds_write_b8 v0, v8 offset:11
256; GFX10-NEXT:    ds_write_b8 v0, v9 offset:5
257; GFX10-NEXT:    ds_write_b8 v0, v1 offset:7
258; GFX10-NEXT:    ds_write_b8 v0, v2 offset:1
259; GFX10-NEXT:    ds_write_b8 v0, v3 offset:3
260; GFX10-NEXT:    s_endpgm
261  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
262  ret void
263}
264
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 2", so the expected code writes eight 16-bit halves.
; Expected lowering, per the RUN lines at the top of the file
;   gfx900, gfx1010 - low half of each dword via ds_write_b16 and high half via
;                     ds_write_b16_d16_hi from the same VGPR (no scalar shifts)
;   hawaii, tahiti  - eight ds_write_b16, with s_lshr_b32 by 16 to extract the
;                     high half of every dword
7define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
266; GFX9-LABEL: store_lds_v4i32_align2:
267; GFX9:       ; %bb.0:
268; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
269; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
270; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
271; GFX9-NEXT:    v_mov_b32_e32 v0, s2
272; GFX9-NEXT:    v_mov_b32_e32 v1, s7
273; GFX9-NEXT:    v_mov_b32_e32 v2, s6
274; GFX9-NEXT:    ds_write_b16 v0, v1 offset:12
275; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
276; GFX9-NEXT:    ds_write_b16 v0, v2 offset:8
277; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
278; GFX9-NEXT:    v_mov_b32_e32 v1, s5
279; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
280; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
281; GFX9-NEXT:    v_mov_b32_e32 v1, s4
282; GFX9-NEXT:    ds_write_b16 v0, v1
283; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
284; GFX9-NEXT:    s_endpgm
285;
286; GFX7-LABEL: store_lds_v4i32_align2:
287; GFX7:       ; %bb.0:
288; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
289; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
290; GFX7-NEXT:    s_mov_b32 m0, -1
291; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX7-NEXT:    v_mov_b32_e32 v0, s4
293; GFX7-NEXT:    v_mov_b32_e32 v1, s3
294; GFX7-NEXT:    v_mov_b32_e32 v2, s2
295; GFX7-NEXT:    ds_write_b16 v0, v1 offset:12
296; GFX7-NEXT:    ds_write_b16 v0, v2 offset:8
297; GFX7-NEXT:    v_mov_b32_e32 v1, s1
298; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
299; GFX7-NEXT:    v_mov_b32_e32 v1, s0
300; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
301; GFX7-NEXT:    ds_write_b16 v0, v1
302; GFX7-NEXT:    v_mov_b32_e32 v1, s3
303; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
304; GFX7-NEXT:    ds_write_b16 v0, v1 offset:14
305; GFX7-NEXT:    v_mov_b32_e32 v1, s2
306; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
307; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
308; GFX7-NEXT:    v_mov_b32_e32 v1, s1
309; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
310; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
311; GFX7-NEXT:    v_mov_b32_e32 v1, s0
312; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
313; GFX7-NEXT:    s_endpgm
314;
315; GFX6-LABEL: store_lds_v4i32_align2:
316; GFX6:       ; %bb.0:
317; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
318; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
319; GFX6-NEXT:    s_mov_b32 m0, -1
320; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX6-NEXT:    v_mov_b32_e32 v0, s4
322; GFX6-NEXT:    v_mov_b32_e32 v1, s3
323; GFX6-NEXT:    v_mov_b32_e32 v2, s2
324; GFX6-NEXT:    ds_write_b16 v0, v1 offset:12
325; GFX6-NEXT:    ds_write_b16 v0, v2 offset:8
326; GFX6-NEXT:    v_mov_b32_e32 v1, s1
327; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
328; GFX6-NEXT:    v_mov_b32_e32 v1, s0
329; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
330; GFX6-NEXT:    ds_write_b16 v0, v1
331; GFX6-NEXT:    v_mov_b32_e32 v1, s3
332; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
333; GFX6-NEXT:    ds_write_b16 v0, v1 offset:14
334; GFX6-NEXT:    v_mov_b32_e32 v1, s2
335; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
336; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
337; GFX6-NEXT:    v_mov_b32_e32 v1, s1
338; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
339; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
340; GFX6-NEXT:    v_mov_b32_e32 v1, s0
341; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
342; GFX6-NEXT:    s_endpgm
343;
344; GFX10-LABEL: store_lds_v4i32_align2:
345; GFX10:       ; %bb.0:
346; GFX10-NEXT:    s_clause 0x1
347; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
348; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
349; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
350; GFX10-NEXT:    v_mov_b32_e32 v0, s2
351; GFX10-NEXT:    v_mov_b32_e32 v1, s7
352; GFX10-NEXT:    v_mov_b32_e32 v2, s6
353; GFX10-NEXT:    v_mov_b32_e32 v3, s5
354; GFX10-NEXT:    v_mov_b32_e32 v4, s4
355; GFX10-NEXT:    ds_write_b16 v0, v1 offset:12
356; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
357; GFX10-NEXT:    ds_write_b16 v0, v2 offset:8
358; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
359; GFX10-NEXT:    ds_write_b16 v0, v3 offset:4
360; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:6
361; GFX10-NEXT:    ds_write_b16 v0, v4
362; GFX10-NEXT:    ds_write_b16_d16_hi v0, v4 offset:2
363; GFX10-NEXT:    s_endpgm
364  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
365  ret void
366}
367
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 4".
; Expected lowering on all four targets named in the RUN lines is a pair of
; ds_write2_b32 instructions, the first covering dwords 0 and 1 (offset1 = 1)
; and the second covering dwords 2 and 3 (offset0 = 2, offset1 = 3). Only the
; register assignment and instruction scheduling differ between targets.
7define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
369; GFX9-LABEL: store_lds_v4i32_align4:
370; GFX9:       ; %bb.0:
371; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
372; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-NEXT:    v_mov_b32_e32 v0, s2
375; GFX9-NEXT:    v_mov_b32_e32 v1, s4
376; GFX9-NEXT:    v_mov_b32_e32 v2, s5
377; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
378; GFX9-NEXT:    v_mov_b32_e32 v3, s6
379; GFX9-NEXT:    v_mov_b32_e32 v1, s7
380; GFX9-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
381; GFX9-NEXT:    s_endpgm
382;
383; GFX7-LABEL: store_lds_v4i32_align4:
384; GFX7:       ; %bb.0:
385; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
386; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
387; GFX7-NEXT:    s_mov_b32 m0, -1
388; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX7-NEXT:    v_mov_b32_e32 v0, s4
390; GFX7-NEXT:    v_mov_b32_e32 v1, s0
391; GFX7-NEXT:    v_mov_b32_e32 v2, s1
392; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
393; GFX7-NEXT:    v_mov_b32_e32 v1, s2
394; GFX7-NEXT:    v_mov_b32_e32 v2, s3
395; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
396; GFX7-NEXT:    s_endpgm
397;
398; GFX6-LABEL: store_lds_v4i32_align4:
399; GFX6:       ; %bb.0:
400; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
401; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
402; GFX6-NEXT:    s_mov_b32 m0, -1
403; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX6-NEXT:    v_mov_b32_e32 v0, s4
405; GFX6-NEXT:    v_mov_b32_e32 v1, s1
406; GFX6-NEXT:    v_mov_b32_e32 v2, s0
407; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
408; GFX6-NEXT:    v_mov_b32_e32 v1, s3
409; GFX6-NEXT:    v_mov_b32_e32 v2, s2
410; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset0:2 offset1:3
411; GFX6-NEXT:    s_endpgm
412;
413; GFX10-LABEL: store_lds_v4i32_align4:
414; GFX10:       ; %bb.0:
415; GFX10-NEXT:    s_clause 0x1
416; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
417; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
418; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX10-NEXT:    v_mov_b32_e32 v0, s2
420; GFX10-NEXT:    v_mov_b32_e32 v1, s4
421; GFX10-NEXT:    v_mov_b32_e32 v2, s5
422; GFX10-NEXT:    v_mov_b32_e32 v3, s6
423; GFX10-NEXT:    v_mov_b32_e32 v4, s7
424; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
425; GFX10-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
426; GFX10-NEXT:    s_endpgm
427  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
428  ret void
429}
430
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; with "align 8".
; Expected lowering on all four targets named in the RUN lines is a single
; ds_write2_b64 with offset1 = 1, writing the low and high 64-bit halves of the
; vector. Only register assignment and v_mov ordering differ between targets.
7define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
432; GFX9-LABEL: store_lds_v4i32_align8:
433; GFX9:       ; %bb.0:
434; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
435; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
436; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX9-NEXT:    v_mov_b32_e32 v4, s2
438; GFX9-NEXT:    v_mov_b32_e32 v0, s4
439; GFX9-NEXT:    v_mov_b32_e32 v2, s6
440; GFX9-NEXT:    v_mov_b32_e32 v1, s5
441; GFX9-NEXT:    v_mov_b32_e32 v3, s7
442; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
443; GFX9-NEXT:    s_endpgm
444;
445; GFX7-LABEL: store_lds_v4i32_align8:
446; GFX7:       ; %bb.0:
447; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
448; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
449; GFX7-NEXT:    s_mov_b32 m0, -1
450; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX7-NEXT:    v_mov_b32_e32 v4, s4
452; GFX7-NEXT:    v_mov_b32_e32 v0, s0
453; GFX7-NEXT:    v_mov_b32_e32 v2, s2
454; GFX7-NEXT:    v_mov_b32_e32 v1, s1
455; GFX7-NEXT:    v_mov_b32_e32 v3, s3
456; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
457; GFX7-NEXT:    s_endpgm
458;
459; GFX6-LABEL: store_lds_v4i32_align8:
460; GFX6:       ; %bb.0:
461; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
462; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
463; GFX6-NEXT:    s_mov_b32 m0, -1
464; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
465; GFX6-NEXT:    v_mov_b32_e32 v4, s4
466; GFX6-NEXT:    v_mov_b32_e32 v0, s2
467; GFX6-NEXT:    v_mov_b32_e32 v1, s3
468; GFX6-NEXT:    v_mov_b32_e32 v2, s0
469; GFX6-NEXT:    v_mov_b32_e32 v3, s1
470; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
471; GFX6-NEXT:    s_endpgm
472;
473; GFX10-LABEL: store_lds_v4i32_align8:
474; GFX10:       ; %bb.0:
475; GFX10-NEXT:    s_clause 0x1
476; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
477; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
478; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX10-NEXT:    v_mov_b32_e32 v4, s2
480; GFX10-NEXT:    v_mov_b32_e32 v0, s4
481; GFX10-NEXT:    v_mov_b32_e32 v2, s6
482; GFX10-NEXT:    v_mov_b32_e32 v1, s5
483; GFX10-NEXT:    v_mov_b32_e32 v3, s7
484; GFX10-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
485; GFX10-NEXT:    s_endpgm
486  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
487  ret void
488}
489
; Stores the <4 x i32> kernel argument %x through the addrspace(3) pointer %out
; with an explicit "align 16".
; Expected lowering, per the RUN lines at the top of the file
;   gfx900, hawaii, gfx1010 - one ds_write_b128
;   tahiti                  - one ds_write2_b64 with offset1 set to 1
;                             (no 128-bit DS write appears in its expectations)
7define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
491; GFX9-LABEL: store_lds_v4i32_align16:
492; GFX9:       ; %bb.0:
493; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
494; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
495; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX9-NEXT:    v_mov_b32_e32 v4, s2
497; GFX9-NEXT:    v_mov_b32_e32 v0, s4
498; GFX9-NEXT:    v_mov_b32_e32 v1, s5
499; GFX9-NEXT:    v_mov_b32_e32 v2, s6
500; GFX9-NEXT:    v_mov_b32_e32 v3, s7
501; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
502; GFX9-NEXT:    s_endpgm
503;
504; GFX7-LABEL: store_lds_v4i32_align16:
505; GFX7:       ; %bb.0:
506; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x9
507; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
508; GFX7-NEXT:    s_mov_b32 m0, -1
509; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
510; GFX7-NEXT:    v_mov_b32_e32 v4, s4
511; GFX7-NEXT:    v_mov_b32_e32 v0, s0
512; GFX7-NEXT:    v_mov_b32_e32 v1, s1
513; GFX7-NEXT:    v_mov_b32_e32 v2, s2
514; GFX7-NEXT:    v_mov_b32_e32 v3, s3
515; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
516; GFX7-NEXT:    s_endpgm
517;
518; GFX6-LABEL: store_lds_v4i32_align16:
519; GFX6:       ; %bb.0:
520; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
521; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
522; GFX6-NEXT:    s_mov_b32 m0, -1
523; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX6-NEXT:    v_mov_b32_e32 v4, s4
525; GFX6-NEXT:    v_mov_b32_e32 v0, s2
526; GFX6-NEXT:    v_mov_b32_e32 v1, s3
527; GFX6-NEXT:    v_mov_b32_e32 v2, s0
528; GFX6-NEXT:    v_mov_b32_e32 v3, s1
529; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
530; GFX6-NEXT:    s_endpgm
531;
532; GFX10-LABEL: store_lds_v4i32_align16:
533; GFX10:       ; %bb.0:
534; GFX10-NEXT:    s_clause 0x1
535; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
536; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
537; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX10-NEXT:    v_mov_b32_e32 v4, s2
539; GFX10-NEXT:    v_mov_b32_e32 v0, s4
540; GFX10-NEXT:    v_mov_b32_e32 v1, s5
541; GFX10-NEXT:    v_mov_b32_e32 v2, s6
542; GFX10-NEXT:    v_mov_b32_e32 v3, s7
543; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
544; GFX10-NEXT:    s_endpgm
545  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
546  ret void
547}
548