1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4
5; Unaligned DS access is available from GFX9 onwards.
6; LDS alignment enforcement is controlled by a configuration register:
7; SH_MEM_CONFIG.alignment_mode
8
; Loads a <4 x i32> through an addrspace(3) pointer with "align 1" and returns it.
; Expected output for the gfx900 run is a single ds_read_b128. Expected output
; for the hawaii run is sixteen ds_read_u8 byte loads (offsets 0 through 15)
; whose results are masked with 0xff, shifted by 8/16/24 and OR-ed together
; into the four result dwords.
9define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
10; GFX9-LABEL: load_lds_v4i32_align1:
11; GFX9:       ; %bb.0:
12; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13; GFX9-NEXT:    ds_read_b128 v[0:3], v0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    s_setpc_b64 s[30:31]
16;
17; GFX7-LABEL: load_lds_v4i32_align1:
18; GFX7:       ; %bb.0:
19; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20; GFX7-NEXT:    s_mov_b32 m0, -1
21; GFX7-NEXT:    s_movk_i32 s4, 0xff
22; GFX7-NEXT:    ds_read_u8 v1, v0
23; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
24; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
25; GFX7-NEXT:    ds_read_u8 v5, v0 offset:3
26; GFX7-NEXT:    ds_read_u8 v6, v0 offset:4
27; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
28; GFX7-NEXT:    ds_read_u8 v8, v0 offset:6
29; GFX7-NEXT:    ds_read_u8 v9, v0 offset:7
30; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
31; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
32; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
33; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
34; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
35; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
36; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
37; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
38; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
39; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
40; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
41; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
42; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
43; GFX7-NEXT:    v_or_b32_e32 v4, v1, v2
44; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
45; GFX7-NEXT:    v_and_b32_e32 v2, v7, v3
46; GFX7-NEXT:    v_and_b32_e32 v1, s4, v6
47; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
48; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
49; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
50; GFX7-NEXT:    v_and_b32_e32 v2, v8, v3
51; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
52; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
53; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX7-NEXT:    v_and_b32_e32 v2, v9, v3
55; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
56; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
57; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
58; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
59; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
60; GFX7-NEXT:    ds_read_u8 v7, v0 offset:11
61; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
62; GFX7-NEXT:    ds_read_u8 v9, v0 offset:13
63; GFX7-NEXT:    ds_read_u8 v10, v0 offset:14
64; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
65; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
66; GFX7-NEXT:    v_and_b32_e32 v5, v5, v3
67; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
68; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
69; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
70; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
71; GFX7-NEXT:    v_and_b32_e32 v5, v6, v3
72; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
73; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
74; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
75; GFX7-NEXT:    v_and_b32_e32 v5, v7, v3
76; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
77; GFX7-NEXT:    v_and_b32_e32 v6, v9, v3
78; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
79; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
80; GFX7-NEXT:    v_and_b32_e32 v5, v8, v3
81; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
82; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
83; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
84; GFX7-NEXT:    v_and_b32_e32 v6, v10, v3
85; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
86; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
87; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
88; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
89; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
90; GFX7-NEXT:    v_or_b32_e32 v3, v5, v0
91; GFX7-NEXT:    v_mov_b32_e32 v0, v4
92; GFX7-NEXT:    s_setpc_b64 s[30:31]
93  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
94  ret <4 x i32> %load
95}
96
; Loads a <3 x i32> through an addrspace(3) pointer with "align 1" and returns it.
; Expected output for the gfx900 run is a single ds_read_b96. Expected output
; for the hawaii run is twelve ds_read_u8 byte loads (offsets 0 through 11)
; whose results are masked with 0xff, shifted by 8/16/24 and OR-ed together
; into the three result dwords.
97define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
98; GFX9-LABEL: load_lds_v3i32_align1:
99; GFX9:       ; %bb.0:
100; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101; GFX9-NEXT:    ds_read_b96 v[0:2], v0
102; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX9-NEXT:    s_setpc_b64 s[30:31]
104;
105; GFX7-LABEL: load_lds_v3i32_align1:
106; GFX7:       ; %bb.0:
107; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108; GFX7-NEXT:    s_mov_b32 m0, -1
109; GFX7-NEXT:    v_mov_b32_e32 v2, v0
110; GFX7-NEXT:    s_movk_i32 s4, 0xff
111; GFX7-NEXT:    ds_read_u8 v0, v0
112; GFX7-NEXT:    ds_read_u8 v1, v2 offset:1
113; GFX7-NEXT:    ds_read_u8 v4, v2 offset:2
114; GFX7-NEXT:    ds_read_u8 v5, v2 offset:3
115; GFX7-NEXT:    ds_read_u8 v6, v2 offset:4
116; GFX7-NEXT:    ds_read_u8 v7, v2 offset:5
117; GFX7-NEXT:    ds_read_u8 v8, v2 offset:6
118; GFX7-NEXT:    ds_read_u8 v9, v2 offset:7
119; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
120; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
121; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
122; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
123; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
124; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
125; GFX7-NEXT:    v_and_b32_e32 v1, s4, v4
126; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
127; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
128; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
129; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
130; GFX7-NEXT:    v_and_b32_e32 v1, s4, v5
131; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
132; GFX7-NEXT:    v_and_b32_e32 v4, v7, v3
133; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
134; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
135; GFX7-NEXT:    v_and_b32_e32 v1, s4, v6
136; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
137; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
138; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
139; GFX7-NEXT:    v_and_b32_e32 v4, v8, v3
140; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
141; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
142; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
143; GFX7-NEXT:    v_and_b32_e32 v4, v9, v3
144; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
145; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
146; GFX7-NEXT:    ds_read_u8 v4, v2 offset:8
147; GFX7-NEXT:    ds_read_u8 v5, v2 offset:9
148; GFX7-NEXT:    ds_read_u8 v6, v2 offset:10
149; GFX7-NEXT:    ds_read_u8 v2, v2 offset:11
150; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
151; GFX7-NEXT:    v_and_b32_e32 v4, v4, v3
152; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
153; GFX7-NEXT:    v_and_b32_e32 v5, v5, v3
154; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
155; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
156; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
157; GFX7-NEXT:    v_and_b32_e32 v5, v6, v3
158; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
159; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
160; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
161; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
162; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
163; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
164; GFX7-NEXT:    s_setpc_b64 s[30:31]
165  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
166  ret <3 x i32> %load
167}
168
; Stores the <4 x i32> argument %x through an addrspace(3) pointer with "align 1".
; Expected output for the gfx900 run is a single ds_write_b128. Expected output
; for the hawaii run is sixteen ds_write_b8 byte stores (offsets 0 through 15);
; the upper bytes of each source dword are extracted with right shifts by
; 8, 16 and 24.
169define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
170; GFX9-LABEL: store_lds_v4i32_align1:
171; GFX9:       ; %bb.0:
172; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GFX9-NEXT:    ds_write_b128 v0, v[1:4]
174; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX9-NEXT:    s_setpc_b64 s[30:31]
176;
177; GFX7-LABEL: store_lds_v4i32_align1:
178; GFX7:       ; %bb.0:
179; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180; GFX7-NEXT:    s_mov_b32 m0, -1
181; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
182; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
183; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
184; GFX7-NEXT:    ds_write_b8 v0, v1
185; GFX7-NEXT:    ds_write_b8 v0, v5 offset:1
186; GFX7-NEXT:    ds_write_b8 v0, v6 offset:2
187; GFX7-NEXT:    ds_write_b8 v0, v7 offset:3
188; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
189; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
190; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
191; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
192; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
193; GFX7-NEXT:    ds_write_b8 v0, v5 offset:6
194; GFX7-NEXT:    ds_write_b8 v0, v6 offset:7
195; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
196; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
197; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
198; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
199; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
200; GFX7-NEXT:    ds_write_b8 v0, v2 offset:10
201; GFX7-NEXT:    ds_write_b8 v0, v5 offset:11
202; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
203; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
204; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
205; GFX7-NEXT:    ds_write_b8 v0, v4 offset:12
206; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
207; GFX7-NEXT:    ds_write_b8 v0, v2 offset:14
208; GFX7-NEXT:    ds_write_b8 v0, v3 offset:15
209; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX7-NEXT:    s_setpc_b64 s[30:31]
211  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
212  ret void
213}
214
; Stores the <3 x i32> argument %x through an addrspace(3) pointer with "align 1".
; Expected output for the gfx900 run is a single ds_write_b96. Expected output
; for the hawaii run is twelve ds_write_b8 byte stores (offsets 0 through 11);
; the upper bytes of each source dword are extracted with right shifts by
; 8, 16 and 24.
215define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
216; GFX9-LABEL: store_lds_v3i32_align1:
217; GFX9:       ; %bb.0:
218; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219; GFX9-NEXT:    ds_write_b96 v0, v[1:3]
220; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX9-NEXT:    s_setpc_b64 s[30:31]
222;
223; GFX7-LABEL: store_lds_v3i32_align1:
224; GFX7:       ; %bb.0:
225; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
227; GFX7-NEXT:    s_mov_b32 m0, -1
228; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
229; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
230; GFX7-NEXT:    ds_write_b8 v0, v1
231; GFX7-NEXT:    ds_write_b8 v0, v4 offset:1
232; GFX7-NEXT:    ds_write_b8 v0, v5 offset:2
233; GFX7-NEXT:    ds_write_b8 v0, v6 offset:3
234; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
235; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
236; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
237; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
238; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
239; GFX7-NEXT:    ds_write_b8 v0, v4 offset:6
240; GFX7-NEXT:    ds_write_b8 v0, v5 offset:7
241; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
242; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
243; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
244; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
245; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
246; GFX7-NEXT:    ds_write_b8 v0, v2 offset:10
247; GFX7-NEXT:    ds_write_b8 v0, v4 offset:11
248; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX7-NEXT:    s_setpc_b64 s[30:31]
250  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
251  ret void
252}
253