1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5
6; FIXME: The tahiti (GFX6) invocation below is disabled (spelled XUN); re-enable it once it works.
7; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
8
; Loads a <4 x i32> through an addrspace(3) pointer (LDS, per the test name)
; with no explicit alignment on the load. Every checked target is expected to
; emit a single ds_read_b128; the GFX7 output additionally sets m0 to -1 first,
; and the GFX10 output additionally starts with s_waitcnt_vscnt.
9define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
10; GFX9-LABEL: load_lds_v4i32:
11; GFX9:       ; %bb.0:
12; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13; GFX9-NEXT:    ds_read_b128 v[0:3], v0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    s_setpc_b64 s[30:31]
16;
17; GFX7-LABEL: load_lds_v4i32:
18; GFX7:       ; %bb.0:
19; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20; GFX7-NEXT:    s_mov_b32 m0, -1
21; GFX7-NEXT:    ds_read_b128 v[0:3], v0
22; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
23; GFX7-NEXT:    s_setpc_b64 s[30:31]
24;
25; GFX10-LABEL: load_lds_v4i32:
26; GFX10:       ; %bb.0:
27; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
29; GFX10-NEXT:    ds_read_b128 v[0:3], v0
30; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX10-NEXT:    s_setpc_b64 s[30:31]
32  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
33  ret <4 x i32> %load
34}
35
; Same <4 x i32> addrspace(3) load, but marked align 1. The expected output on
; every checked target splits the access into sixteen ds_read_u8 loads
; (offsets 0 through 15) and rebuilds the four result dwords in v0..v3 with
; mask / shift / or sequences.
36define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
37; GFX9-LABEL: load_lds_v4i32_align1:
38; GFX9:       ; %bb.0:
39; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40; GFX9-NEXT:    ds_read_u8 v1, v0
41; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
42; GFX9-NEXT:    ds_read_u8 v4, v0 offset:2
43; GFX9-NEXT:    ds_read_u8 v5, v0 offset:3
44; GFX9-NEXT:    ds_read_u8 v6, v0 offset:4
45; GFX9-NEXT:    ds_read_u8 v7, v0 offset:5
46; GFX9-NEXT:    ds_read_u8 v8, v0 offset:6
47; GFX9-NEXT:    ds_read_u8 v9, v0 offset:7
48; GFX9-NEXT:    s_mov_b32 s5, 8
49; GFX9-NEXT:    s_movk_i32 s4, 0xff
50; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
51; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
52; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
53; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
54; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
55; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
56; GFX9-NEXT:    v_and_b32_e32 v4, s4, v5
57; GFX9-NEXT:    v_mov_b32_e32 v3, 0xff
58; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
59; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
60; GFX9-NEXT:    v_or3_b32 v4, v1, v2, v4
61; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
62; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
63; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
64; GFX9-NEXT:    v_and_b32_e32 v2, v8, v3
65; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX9-NEXT:    v_and_b32_e32 v5, v9, v3
67; GFX9-NEXT:    v_and_or_b32 v1, v6, s4, v1
68; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
69; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
70; GFX9-NEXT:    v_or3_b32 v1, v1, v2, v5
71; GFX9-NEXT:    ds_read_u8 v2, v0 offset:8
72; GFX9-NEXT:    ds_read_u8 v6, v0 offset:9
73; GFX9-NEXT:    ds_read_u8 v7, v0 offset:10
74; GFX9-NEXT:    ds_read_u8 v8, v0 offset:11
75; GFX9-NEXT:    ds_read_u8 v9, v0 offset:12
76; GFX9-NEXT:    ds_read_u8 v10, v0 offset:13
77; GFX9-NEXT:    ds_read_u8 v11, v0 offset:14
78; GFX9-NEXT:    ds_read_u8 v0, v0 offset:15
79; GFX9-NEXT:    v_mov_b32_e32 v5, 8
80; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
81; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
82; GFX9-NEXT:    v_and_or_b32 v2, v2, v3, v6
83; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
84; GFX9-NEXT:    v_and_b32_e32 v6, v7, v3
85; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
86; GFX9-NEXT:    v_and_b32_e32 v7, v8, v3
87; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
88; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
89; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
90; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
91; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
92; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
93; GFX9-NEXT:    v_or3_b32 v2, v2, v6, v7
94; GFX9-NEXT:    v_and_b32_e32 v6, v11, v3
95; GFX9-NEXT:    v_and_or_b32 v5, v9, v3, v5
96; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
97; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
98; GFX9-NEXT:    v_or3_b32 v3, v5, v6, v0
99; GFX9-NEXT:    v_mov_b32_e32 v0, v4
100; GFX9-NEXT:    s_setpc_b64 s[30:31]
101;
102; GFX7-LABEL: load_lds_v4i32_align1:
103; GFX7:       ; %bb.0:
104; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX7-NEXT:    s_mov_b32 m0, -1
106; GFX7-NEXT:    s_movk_i32 s4, 0xff
107; GFX7-NEXT:    ds_read_u8 v1, v0
108; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
109; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
110; GFX7-NEXT:    ds_read_u8 v5, v0 offset:3
111; GFX7-NEXT:    ds_read_u8 v6, v0 offset:4
112; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
113; GFX7-NEXT:    ds_read_u8 v8, v0 offset:6
114; GFX7-NEXT:    ds_read_u8 v9, v0 offset:7
115; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
116; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
117; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
118; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
119; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
120; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
121; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
122; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
123; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
124; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
125; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
126; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
127; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
128; GFX7-NEXT:    v_or_b32_e32 v4, v1, v2
129; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
130; GFX7-NEXT:    v_and_b32_e32 v2, v7, v3
131; GFX7-NEXT:    v_and_b32_e32 v1, s4, v6
132; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
133; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
134; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
135; GFX7-NEXT:    v_and_b32_e32 v2, v8, v3
136; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
137; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
138; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX7-NEXT:    v_and_b32_e32 v2, v9, v3
140; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
141; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
142; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
143; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
144; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
145; GFX7-NEXT:    ds_read_u8 v7, v0 offset:11
146; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
147; GFX7-NEXT:    ds_read_u8 v9, v0 offset:13
148; GFX7-NEXT:    ds_read_u8 v10, v0 offset:14
149; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
150; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
151; GFX7-NEXT:    v_and_b32_e32 v5, v5, v3
152; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
153; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
154; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
155; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
156; GFX7-NEXT:    v_and_b32_e32 v5, v6, v3
157; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
158; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
159; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
160; GFX7-NEXT:    v_and_b32_e32 v5, v7, v3
161; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
162; GFX7-NEXT:    v_and_b32_e32 v6, v9, v3
163; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
164; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
165; GFX7-NEXT:    v_and_b32_e32 v5, v8, v3
166; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
167; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
168; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
169; GFX7-NEXT:    v_and_b32_e32 v6, v10, v3
170; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
171; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
172; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
173; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
174; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
175; GFX7-NEXT:    v_or_b32_e32 v3, v5, v0
176; GFX7-NEXT:    v_mov_b32_e32 v0, v4
177; GFX7-NEXT:    s_setpc_b64 s[30:31]
178;
179; GFX10-LABEL: load_lds_v4i32_align1:
180; GFX10:       ; %bb.0:
181; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
183; GFX10-NEXT:    ds_read_u8 v1, v0 offset:1
184; GFX10-NEXT:    ds_read_u8 v2, v0 offset:2
185; GFX10-NEXT:    ds_read_u8 v3, v0 offset:3
186; GFX10-NEXT:    ds_read_u8 v4, v0 offset:5
187; GFX10-NEXT:    ds_read_u8 v5, v0 offset:6
188; GFX10-NEXT:    ds_read_u8 v6, v0 offset:7
189; GFX10-NEXT:    ds_read_u8 v7, v0 offset:9
190; GFX10-NEXT:    ds_read_u8 v8, v0
191; GFX10-NEXT:    ds_read_u8 v9, v0 offset:4
192; GFX10-NEXT:    ds_read_u8 v10, v0 offset:8
193; GFX10-NEXT:    ds_read_u8 v12, v0 offset:10
194; GFX10-NEXT:    ds_read_u8 v13, v0 offset:11
195; GFX10-NEXT:    ds_read_u8 v14, v0 offset:12
196; GFX10-NEXT:    ds_read_u8 v15, v0 offset:13
197; GFX10-NEXT:    ds_read_u8 v16, v0 offset:14
198; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
199; GFX10-NEXT:    v_mov_b32_e32 v17, 8
200; GFX10-NEXT:    s_mov_b32 s5, 8
201; GFX10-NEXT:    v_mov_b32_e32 v11, 0xff
202; GFX10-NEXT:    s_movk_i32 s4, 0xff
203; GFX10-NEXT:    s_waitcnt lgkmcnt(15)
204; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
205; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
206; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
207; GFX10-NEXT:    s_waitcnt lgkmcnt(13)
208; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
209; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
210; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
211; GFX10-NEXT:    s_waitcnt lgkmcnt(11)
212; GFX10-NEXT:    v_and_b32_e32 v5, v5, v11
213; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
214; GFX10-NEXT:    v_and_b32_e32 v6, v6, v11
215; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
216; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
217; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
218; GFX10-NEXT:    v_and_or_b32 v1, v8, s4, v1
219; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
220; GFX10-NEXT:    v_and_b32_e32 v8, v12, v11
221; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
222; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
223; GFX10-NEXT:    v_and_b32_e32 v9, v13, v11
224; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v7
225; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
226; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
227; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
228; GFX10-NEXT:    v_and_b32_e32 v12, v16, v11
229; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
230; GFX10-NEXT:    v_and_b32_e32 v0, v0, v11
231; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
232; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
233; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
234; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
235; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
236; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
237; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
238; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
239; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
240; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
241; GFX10-NEXT:    v_or3_b32 v3, v10, v11, v12
242; GFX10-NEXT:    v_or3_b32 v1, v4, v5, v6
243; GFX10-NEXT:    v_or3_b32 v2, v7, v8, v9
244; GFX10-NEXT:    s_setpc_b64 s[30:31]
245  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
246  ret <4 x i32> %load
247}
248
; Same <4 x i32> addrspace(3) load, but marked align 2. The expected output on
; every checked target uses eight ds_read_u16 loads (offsets 0 through 14) and
; merges each pair of halfwords into one result dword with a 0xffff mask, a
; shift left by 16 and an or.
249define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
250; GFX9-LABEL: load_lds_v4i32_align2:
251; GFX9:       ; %bb.0:
252; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253; GFX9-NEXT:    s_mov_b32 s4, 0xffff
254; GFX9-NEXT:    ds_read_u16 v1, v0
255; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
256; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
257; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
258; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
259; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
260; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
261; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
262; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
263; GFX9-NEXT:    v_and_b32_e32 v0, s4, v2
264; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
265; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
266; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
267; GFX9-NEXT:    v_and_b32_e32 v1, s4, v4
268; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
269; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
270; GFX9-NEXT:    v_and_b32_e32 v2, s4, v6
271; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v1
272; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX9-NEXT:    v_and_b32_e32 v3, s4, v8
274; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
275; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
276; GFX9-NEXT:    v_and_or_b32 v2, v5, s4, v2
277; GFX9-NEXT:    v_and_or_b32 v3, v7, s4, v3
278; GFX9-NEXT:    s_setpc_b64 s[30:31]
279;
280; GFX7-LABEL: load_lds_v4i32_align2:
281; GFX7:       ; %bb.0:
282; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283; GFX7-NEXT:    s_mov_b32 m0, -1
284; GFX7-NEXT:    ds_read_u16 v1, v0
285; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
286; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
287; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
288; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
289; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
290; GFX7-NEXT:    ds_read_u16 v7, v0 offset:12
291; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
292; GFX7-NEXT:    s_mov_b32 s4, 0xffff
293; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
294; GFX7-NEXT:    v_and_b32_e32 v0, s4, v1
295; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
296; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
297; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
298; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
299; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
300; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
301; GFX7-NEXT:    v_and_b32_e32 v1, s4, v3
302; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
303; GFX7-NEXT:    v_and_b32_e32 v3, s4, v6
304; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
305; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v8
307; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
308; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
309; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
310; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
311; GFX7-NEXT:    v_and_b32_e32 v3, s4, v7
312; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
313; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
314; GFX7-NEXT:    s_setpc_b64 s[30:31]
315;
316; GFX10-LABEL: load_lds_v4i32_align2:
317; GFX10:       ; %bb.0:
318; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
320; GFX10-NEXT:    ds_read_u16 v1, v0 offset:2
321; GFX10-NEXT:    ds_read_u16 v2, v0 offset:6
322; GFX10-NEXT:    ds_read_u16 v3, v0 offset:10
323; GFX10-NEXT:    ds_read_u16 v4, v0 offset:14
324; GFX10-NEXT:    ds_read_u16 v5, v0
325; GFX10-NEXT:    ds_read_u16 v6, v0 offset:4
326; GFX10-NEXT:    ds_read_u16 v7, v0 offset:8
327; GFX10-NEXT:    ds_read_u16 v8, v0 offset:12
328; GFX10-NEXT:    s_mov_b32 s4, 0xffff
329; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
330; GFX10-NEXT:    v_and_b32_e32 v0, s4, v1
331; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
332; GFX10-NEXT:    v_and_b32_e32 v1, s4, v2
333; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
334; GFX10-NEXT:    v_and_b32_e32 v2, s4, v3
335; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
336; GFX10-NEXT:    v_and_b32_e32 v3, s4, v4
337; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
338; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
339; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
340; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
341; GFX10-NEXT:    s_waitcnt lgkmcnt(3)
342; GFX10-NEXT:    v_and_or_b32 v0, v5, s4, v0
343; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
344; GFX10-NEXT:    v_and_or_b32 v1, v6, s4, v1
345; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
346; GFX10-NEXT:    v_and_or_b32 v2, v7, s4, v2
347; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
348; GFX10-NEXT:    v_and_or_b32 v3, v8, s4, v3
349; GFX10-NEXT:    s_setpc_b64 s[30:31]
350  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
351  ret <4 x i32> %load
352}
353
; Same <4 x i32> addrspace(3) load, but marked align 4. The expected output on
; every checked target is two ds_read2_b32 instructions (dword offsets 0/1 and
; 2/3). The address in v0 is copied to v2 beforehand, and the second read uses
; that copy because the first read writes its result into v[0:1].
354define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
355; GFX9-LABEL: load_lds_v4i32_align4:
356; GFX9:       ; %bb.0:
357; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
358; GFX9-NEXT:    v_mov_b32_e32 v2, v0
359; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
360; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
361; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX9-NEXT:    s_setpc_b64 s[30:31]
363;
364; GFX7-LABEL: load_lds_v4i32_align4:
365; GFX7:       ; %bb.0:
366; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367; GFX7-NEXT:    v_mov_b32_e32 v2, v0
368; GFX7-NEXT:    s_mov_b32 m0, -1
369; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
370; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
371; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
372; GFX7-NEXT:    s_setpc_b64 s[30:31]
373;
374; GFX10-LABEL: load_lds_v4i32_align4:
375; GFX10:       ; %bb.0:
376; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
378; GFX10-NEXT:    v_mov_b32_e32 v2, v0
379; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
380; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
381; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
382; GFX10-NEXT:    s_setpc_b64 s[30:31]
383  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
384  ret <4 x i32> %load
385}
386
; Same <4 x i32> addrspace(3) load, but marked align 8. The expected output on
; every checked target is a single ds_read2_b64 with offset1:1 that fills
; v[0:3] directly.
387define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
388; GFX9-LABEL: load_lds_v4i32_align8:
389; GFX9:       ; %bb.0:
390; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
392; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX9-NEXT:    s_setpc_b64 s[30:31]
394;
395; GFX7-LABEL: load_lds_v4i32_align8:
396; GFX7:       ; %bb.0:
397; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
398; GFX7-NEXT:    s_mov_b32 m0, -1
399; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
400; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
401; GFX7-NEXT:    s_setpc_b64 s[30:31]
402;
403; GFX10-LABEL: load_lds_v4i32_align8:
404; GFX10:       ; %bb.0:
405; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
407; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
408; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX10-NEXT:    s_setpc_b64 s[30:31]
410  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
411  ret <4 x i32> %load
412}
413
; Same <4 x i32> addrspace(3) load, but marked align 16. The expected output on
; every checked target is a single ds_read_b128, i.e. the same instruction
; sequence that is checked for the load without an explicit alignment.
414define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
415; GFX9-LABEL: load_lds_v4i32_align16:
416; GFX9:       ; %bb.0:
417; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
418; GFX9-NEXT:    ds_read_b128 v[0:3], v0
419; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX9-NEXT:    s_setpc_b64 s[30:31]
421;
422; GFX7-LABEL: load_lds_v4i32_align16:
423; GFX7:       ; %bb.0:
424; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; GFX7-NEXT:    s_mov_b32 m0, -1
426; GFX7-NEXT:    ds_read_b128 v[0:3], v0
427; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX7-NEXT:    s_setpc_b64 s[30:31]
429;
430; GFX10-LABEL: load_lds_v4i32_align16:
431; GFX10:       ; %bb.0:
432; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
434; GFX10-NEXT:    ds_read_b128 v[0:3], v0
435; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
436; GFX10-NEXT:    s_setpc_b64 s[30:31]
437  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
438  ret <4 x i32> %load
439}
440