1; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
2; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
3; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
4
5; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo:
6; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7; GFX900-NEXT: ds_read_u16 v2, v0
8; GFX900-NEXT: v_mov_b32_e32 v3, 0
9; GFX900-NEXT: s_waitcnt lgkmcnt(0)
10; GFX900-NEXT: v_mov_b32_e32 v1, v2
11; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
12; GFX900-NEXT: ds_write_b16 v3, v2
13; GFX900-NEXT: s_waitcnt lgkmcnt(1)
14; GFX900-NEXT: v_mov_b32_e32 v0, v1
15; GFX900-NEXT: s_waitcnt lgkmcnt(0)
16; GFX900-NEXT: s_setpc_b64 s[30:31]
17define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
18entry:
19  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
20  %load.lo = load i16, i16 addrspace(3)* %in
21  %load.hi = load i16, i16 addrspace(3)* %gep
22  store i16 %load.lo, i16 addrspace(3)* null
23  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
24  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
25  ret <2 x i16> %build1
26}
27
28; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi:
29; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; GFX900-NEXT: ds_read_u16 v1, v0
31; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
32; GFX900-NEXT: v_mov_b32_e32 v2, 0
33; GFX900-NEXT: s_waitcnt lgkmcnt(1)
34; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1
35; GFX900-NEXT: s_waitcnt lgkmcnt(0)
36; GFX900-NEXT: ds_write_b16 v2, v0
37; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
38; GFX900-NEXT: s_waitcnt lgkmcnt(0)
39; GFX900-NEXT: s_setpc_b64 s[30:31]
40define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
41entry:
42  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
43  %load.lo = load i16, i16 addrspace(3)* %in
44  %load.hi = load i16, i16 addrspace(3)* %gep
45  store i16 %load.hi, i16 addrspace(3)* null
46  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
47  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
48  ret <2 x i16> %build1
49}
50
51; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi:
52; GFX900: ds_read_u16 v3, v0
53; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
54; GFX900-NEXT: s_waitcnt lgkmcnt(1)
55; GFX900-NEXT: ds_write_b16 v1, v3
56; GFX900-NEXT: s_waitcnt lgkmcnt(1)
57; GFX900-NEXT: ds_write_b16 v2, v0
58; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3
59; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
60; GFX900-NEXT: s_waitcnt lgkmcnt(0)
61; GFX900-NEXT: s_setpc_b64 s[30:31]
62define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
63entry:
64  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
65  %load.lo = load i16, i16 addrspace(3)* %in
66  %load.hi = load i16, i16 addrspace(3)* %gep
67  store i16 %load.lo, i16 addrspace(3)* %out0
68  store i16 %load.hi, i16 addrspace(3)* %out1
69  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
70  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
71  ret <2 x i16> %build1
72}
73
74; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
75; GCN: s_waitcnt
76; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
77; GFX900-NEXT: s_waitcnt
78; GFX900-NEXT: s_setpc_b64
79
80; NO-D16-HI: ds_read_u16 v
81define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
82entry:
83  %load = load i16, i16 addrspace(3)* %in
84  %build = insertelement <2 x i16> undef, i16 %load, i32 1
85  ret <2 x i16> %build
86}
87
88; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
89; GCN: s_waitcnt
90; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
91; GFX900-NEXT: s_waitcnt
92; GFX900-NEXT: v_mov_b32_e32 v0, v1
93; GFX900-NEXT: s_setpc_b64
94
95; NO-D16-HI: ds_read_u16 v
96define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
97entry:
98  %load = load i16, i16 addrspace(3)* %in
99  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
100  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
101  ret <2 x i16> %build1
102}
103
104; Show that we get reasonable regalloc without physreg constraints.
105; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
106; GCN: s_waitcnt
107; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
108; GFX900-NEXT: s_waitcnt
109; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
110; GFX900-NEXT: s_waitcnt
111; GFX900-NEXT: s_setpc_b64
112
113; NO-D16-HI: ds_read_u16 v
114define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
115entry:
116  %load = load i16, i16 addrspace(3)* %in
117  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
118  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
119  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
120  ret void
121}
122
123; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
124; GCN: s_waitcnt
125; GFX900-NEXT: v_mov_b32_e32 v1, 0
126; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
127; GFX900-NEXT: s_waitcnt
128; GFX900-NEXT: v_mov_b32_e32 v0, v1
129; GFX900-NEXT: s_setpc_b64
130
131; NO-D16-HI: ds_read_u16 v
132define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
133entry:
134  %load = load i16, i16 addrspace(3)* %in
135  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
136  ret <2 x i16> %build
137}
138
139; FIXME: Remove m0 initialization
140; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
141; GCN: s_waitcnt
142; GFX900-NEXT: ds_read_u16 v0, v0
143; GFX900-NEXT: s_waitcnt lgkmcnt(0)
144; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
145; GFX900-NEXT: s_setpc_b64
146
147; NO-D16-HI: ds_read_u16 v
148; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
149define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
150entry:
151  %load = load i16, i16 addrspace(3)* %in
152  %zext = zext i16 %load to i32
153  %shift = shl i32 %zext, 16
154  ret i32 %shift
155}
156
157; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
158; GCN: s_waitcnt
159; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
160; GFX900-NEXT: s_waitcnt
161; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
162; GFX900-NEXT: s_waitcnt
163; GFX900-NEXT: s_setpc_b64
164
165; NO-D16-HI: ds_read_u16 v
166define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
167entry:
168  %load = load half, half addrspace(3)* %in
169  %build0 = insertelement <2 x half> undef, half %reg, i32 0
170  %build1 = insertelement <2 x half> %build0, half %load, i32 1
171  store <2 x half> %build1, <2 x half> addrspace(1)* undef
172  ret void
173}
174
175; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
176; GCN: s_waitcnt
177; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
178; GFX900-NEXT: s_waitcnt
179; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
180; GFX900-NEXT: s_waitcnt
181; GFX900-NEXT: s_setpc_b64
182
183; NO-D16-HI: ds_read_u8 v
184define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
185entry:
186  %load = load i8, i8 addrspace(3)* %in
187  %ext = zext i8 %load to i16
188  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
189  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
190  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
191  ret void
192}
193
194; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
195; GCN: s_waitcnt
196; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
197; GFX900-NEXT: s_waitcnt
198; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
199; GFX900-NEXT: s_waitcnt
200; GFX900-NEXT: s_setpc_b64
201
202; NO-D16-HI: ds_read_i8 v
203define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
204entry:
205  %load = load i8, i8 addrspace(3)* %in
206  %ext = sext i8 %load to i16
207  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
208  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
209  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
210  ret void
211}
212
213; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8:
214; GCN: s_waitcnt
215; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
216; GFX900-NEXT: s_waitcnt
217; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
218; GFX900-NEXT: s_waitcnt
219; GFX900-NEXT: s_setpc_b64
220
221; NO-D16-HI: ds_read_u8 v
222define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
223entry:
224  %load = load i8, i8 addrspace(3)* %in
225  %ext = zext i8 %load to i16
226  %bitcast = bitcast i16 %ext to half
227
228  %build0 = insertelement <2 x half> undef, half %reg, i32 0
229  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
230  store <2 x half> %build1, <2 x half> addrspace(1)* undef
231  ret void
232}
233
234; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8:
235; GCN: s_waitcnt
236; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
237; GFX900-NEXT: s_waitcnt
238; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
239; GFX900-NEXT: s_waitcnt
240; GFX900-NEXT: s_setpc_b64
241
242; NO-D16-HI: ds_read_i8 v
243define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
244entry:
245  %load = load i8, i8 addrspace(3)* %in
246  %ext = sext i8 %load to i16
247  %bitcast = bitcast i16 %ext to half
248
249  %build0 = insertelement <2 x half> undef, half %reg, i32 0
250  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
251  store <2 x half> %build1, <2 x half> addrspace(1)* undef
252  ret void
253}
254
255; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
256; GCN: s_waitcnt
257; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
258; GFX900-NEXT: s_waitcnt
259; GFX900-NEXT: global_store_dword
260; GFX900-NEXT: s_waitcnt
261; GFX900-NEXT: s_setpc_b64
262define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
263entry:
264  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
265  %load = load i16, i16 addrspace(1)* %gep
266  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
267  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
268  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
269  ret void
270}
271
272; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
273; GCN: s_waitcnt
274; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
275; GFX900-NEXT: s_waitcnt
276; GFX900-NEXT: global_store_dword
277; GFX900-NEXT: s_waitcnt
278; GFX900-NEXT: s_setpc_b64
279define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
280entry:
281  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
282  %load = load half, half addrspace(1)* %gep
283  %build0 = insertelement <2 x half> undef, half %reg, i32 0
284  %build1 = insertelement <2 x half> %build0, half %load, i32 1
285  store <2 x half> %build1, <2 x half> addrspace(1)* undef
286  ret void
287}
288
289; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
290; GCN: s_waitcnt
291; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
292; GFX900-NEXT: s_waitcnt
293; GFX900-NEXT: global_store_dword
294; GFX900-NEXT: s_waitcnt
295; GFX900-NEXT: s_setpc_b64
296define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
297entry:
298  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
299  %load = load i8, i8 addrspace(1)* %gep
300  %ext = zext i8 %load to i16
301  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
302  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
303  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
304  ret void
305}
306
307; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
308; GCN: s_waitcnt
309; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
310; GFX900-NEXT: s_waitcnt
311; GFX900-NEXT: global_store_dword
312; GFX900-NEXT: s_waitcnt
313; GFX900-NEXT: s_setpc_b64
314define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
315entry:
316  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
317  %load = load i8, i8 addrspace(1)* %gep
318  %ext = sext i8 %load to i16
319  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
320  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
321  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
322  ret void
323}
324
325; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8:
326; GCN: s_waitcnt
327; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
328; GFX900-NEXT: s_waitcnt
329; GFX900-NEXT: global_store_dword
330; GFX900-NEXT: s_waitcnt
331; GFX900-NEXT: s_setpc_b64
332define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
333entry:
334  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
335  %load = load i8, i8 addrspace(1)* %gep
336  %ext = sext i8 %load to i16
337  %bitcast = bitcast i16 %ext to half
338  %build0 = insertelement <2 x half> undef, half %reg, i32 0
339  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
340  store <2 x half> %build1, <2 x half> addrspace(1)* undef
341  ret void
342}
343
344; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8:
345; GCN: s_waitcnt
346; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
347; GFX900-NEXT: s_waitcnt
348; GFX900-NEXT: global_store_dword
349; GFX900-NEXT: s_waitcnt
350; GFX900-NEXT: s_setpc_b64
351define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
352entry:
353  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
354  %load = load i8, i8 addrspace(1)* %gep
355  %ext = zext i8 %load to i16
356  %bitcast = bitcast i16 %ext to half
357  %build0 = insertelement <2 x half> undef, half %reg, i32 0
358  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
359  store <2 x half> %build1, <2 x half> addrspace(1)* undef
360  ret void
361}
362
363; GCN-LABEL: load_flat_hi_v2i16_reglo_vreg:
364; GCN: s_waitcnt
365; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
366; GFX900-NEXT: s_waitcnt
367; GFX900-NEXT: global_store_dword v[0:1], v2
368; GFX900-NEXT: s_waitcnt
369; GFX900-NEXT: s_setpc_b64
370
371; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
372; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
373; GFX803: v_or_b32_sdwa
374; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
375define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
376entry:
377  %load = load i16, i16* %in
378  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
379  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
380  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
381  ret void
382}
383
384; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
385; GCN: s_waitcnt
386; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
387; GFX900-NEXT: s_waitcnt
388; GFX900-NEXT: global_store_dword v[0:1], v2
389; GFX900-NEXT: s_waitcnt
390; GFX900-NEXT: s_setpc_b64
391
392; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
393; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
394; GFX803: v_or_b32_sdwa
395; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
396define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
397entry:
398  %load = load half, half* %in
399  %build0 = insertelement <2 x half> undef, half %reg, i32 0
400  %build1 = insertelement <2 x half> %build0, half %load, i32 1
401  store <2 x half> %build1, <2 x half> addrspace(1)* undef
402  ret void
403}
404
405; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
406; GCN: s_waitcnt
407; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
408; GFX900-NEXT: s_waitcnt
409; GFX900-NEXT: global_store_dword v[0:1], v2
410; GFX900-NEXT: s_waitcnt
411; GFX900-NEXT: s_setpc_b64
412
413; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
414; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
415; GFX803: v_or_b32_sdwa
416; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
417define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
418entry:
419  %load = load i8, i8* %in
420  %ext = zext i8 %load to i16
421  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
422  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
423  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
424  ret void
425}
426
427; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
428; GCN: s_waitcnt
429; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
430; GFX900-NEXT: s_waitcnt
431; GFX900-NEXT: global_store_dword v[0:1], v2
432; GFX900-NEXT: s_waitcnt
433; GFX900-NEXT: s_setpc_b64
434
435; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
436; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
437; GFX803: v_or_b32_sdwa
438; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
439define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
440entry:
441  %load = load i8, i8* %in
442  %ext = sext i8 %load to i16
443  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
444  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
445  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
446  ret void
447}
448
449; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8:
450; GCN: s_waitcnt
451; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
452; GFX900-NEXT: s_waitcnt
453; GFX900-NEXT: global_store_dword v[0:1], v2
454; GFX900-NEXT: s_waitcnt
455; GFX900-NEXT: s_setpc_b64
456
457; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
458; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
459; GFX803: v_or_b32_sdwa
460; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
461define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
462entry:
463  %load = load i8, i8* %in
464  %ext = zext i8 %load to i16
465  %bitcast = bitcast i16 %ext to half
466  %build0 = insertelement <2 x half> undef, half %reg, i32 0
467  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
468  store <2 x half> %build1, <2 x half> addrspace(1)* undef
469  ret void
470}
471
472; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8:
473; GCN: s_waitcnt
474; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
475; GFX900-NEXT: s_waitcnt
476; GFX900-NEXT: global_store_dword v[0:1], v2
477; GFX900-NEXT: s_waitcnt
478; GFX900-NEXT: s_setpc_b64
479
480; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
481; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
482; GFX803: v_or_b32_sdwa
483; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
484define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
485entry:
486  %load = load i8, i8* %in
487  %ext = sext i8 %load to i16
488  %bitcast = bitcast i16 %ext to half
489  %build0 = insertelement <2 x half> undef, half %reg, i32 0
490  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
491  store <2 x half> %build1, <2 x half> addrspace(1)* undef
492  ret void
493}
494
495; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
496; GCN: s_waitcnt
497; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
498; GFX900-NEXT: s_waitcnt
499; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
500; GFX900-NEXT: s_waitcnt
501; GFX900-NEXT: s_setpc_b64
502
503; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
504define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
505entry:
506  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
507  %load = load i16, i16 addrspace(5)* %gep
508  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
509  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
510  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
511  ret void
512}
513
514; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
515; GCN: s_waitcnt
516; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
517; GFX900-NEXT: s_waitcnt
518; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
519; GFX900-NEXT: s_waitcnt
520; GFX900-NEXT: s_setpc_b64
521
522; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
523define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
524entry:
525  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
526  %load = load half, half addrspace(5)* %gep
527  %build0 = insertelement <2 x half> undef, half %reg, i32 0
528  %build1 = insertelement <2 x half> %build0, half %load, i32 1
529  store <2 x half> %build1, <2 x half> addrspace(1)* undef
530  ret void
531}
532
533; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
534; GCN: s_waitcnt
535; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s33 offset:4094{{$}}
536; GFX900: s_waitcnt
537; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
538; GFX900-NEXT: s_waitcnt
539; GFX900-NEXT: s_setpc_b64
540
541; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
542define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
543entry:
544  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
545  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
546  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
547  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
548  ret void
549}
550
551; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
552; GCN: s_waitcnt
553; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
554; GFX900-NEXT: s_waitcnt
555; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
556; GFX900-NEXT: s_waitcnt
557; GFX900-NEXT: s_setpc_b64
558
559; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
560define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
561entry:
562  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
563  %build0 = insertelement <2 x half> undef, half %reg, i32 0
564  %build1 = insertelement <2 x half> %build0, half %load, i32 1
565  store <2 x half> %build1, <2 x half> addrspace(1)* undef
566  ret void
567}
568
569; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
570; GCN: s_waitcnt
571; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
572; GFX900-NEXT: s_waitcnt
573; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
574; GFX900-NEXT: s_waitcnt
575; GFX900-NEXT: s_setpc_b64
576
577; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
578define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
579entry:
580  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
581  %load = load i8, i8 addrspace(5)* %gep
582  %ext = zext i8 %load to i16
583  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
584  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
585  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
586  ret void
587}
588
589; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8:
590; GCN: s_waitcnt
591; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
592; GFX900-NEXT: s_waitcnt
593; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
594; GFX900-NEXT: s_waitcnt
595; GFX900-NEXT: s_setpc_b64
596
597; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
598define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
599entry:
600  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
601  %load = load i8, i8 addrspace(5)* %gep
602  %ext = zext i8 %load to i16
603  %bitcast = bitcast i16 %ext to half
604  %build0 = insertelement <2 x half> undef, half %reg, i32 0
605  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
606  store <2 x half> %build1, <2 x half> addrspace(1)* undef
607  ret void
608}
609
610; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8:
611; GCN: s_waitcnt
612; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
613; GFX900-NEXT: s_waitcnt
614; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
615; GFX900-NEXT: s_waitcnt
616; GFX900-NEXT: s_setpc_b64
617
618; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
619define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
620entry:
621  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
622  %load = load i8, i8 addrspace(5)* %gep
623  %ext = sext i8 %load to i16
624  %bitcast = bitcast i16 %ext to half
625  %build0 = insertelement <2 x half> undef, half %reg, i32 0
626  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
627  store <2 x half> %build1, <2 x half> addrspace(1)* undef
628  ret void
629}
630
631; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
632; GCN: s_waitcnt
633; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
634; GFX900-NEXT: s_waitcnt
635; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
636; GFX900-NEXT: s_waitcnt
637; GFX900-NEXT: s_setpc_b64
638
639; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
640define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
641entry:
642  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
643  %load = load i8, i8 addrspace(5)* %gep
644  %ext = sext i8 %load to i16
645  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
646  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
647  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
648  ret void
649}
650
651; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
652; GCN: s_waitcnt
653; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
654; GFX900-NEXT: s_waitcnt
655; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
656; GFX900-NEXT: s_waitcnt
657; GFX900-NEXT: s_setpc_b64
658
659; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
660define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
661entry:
662  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
663  %ext = zext i8 %load to i16
664  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
665  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
666  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
667  ret void
668}
669
670; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
671; GCN: s_waitcnt
672; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
673; GFX900-NEXT: s_waitcnt
674; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
675; GFX900-NEXT: s_waitcnt
676; GFX900-NEXT: s_setpc_b64
677
678; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
679define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
680entry:
681  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
682  %ext = sext i8 %load to i16
683  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
684  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
685  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
686  ret void
687}
688
689; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
690; GCN: s_waitcnt
691; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
692; GFX900-NEXT: s_waitcnt
693; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
694; GFX900-NEXT: s_waitcnt
695; GFX900-NEXT: s_setpc_b64
696
697; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
698define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
699entry:
700  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
701  %ext = zext i8 %load to i16
702  %bc.ext = bitcast i16 %ext to half
703  %build0 = insertelement <2 x half> undef, half %reg, i32 0
704  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
705  store <2 x half> %build1, <2 x half> addrspace(1)* undef
706  ret void
707}
708
709; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
710; GCN: s_waitcnt
711; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
712; GFX900-NEXT: s_waitcnt
713; GFX900-NEXT: global_store_dword
714; GFX900-NEXT: s_waitcnt
715; GFX900-NEXT: s_setpc_b64
716
717; GFX803: flat_load_ushort
718; GFX906: global_load_ushort
719define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
720entry:
721  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
722  %load = load i16, i16 addrspace(4)* %gep
723  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
724  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
725  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
726  ret void
727}
728
729; GCN-LABEL: load_constant_hi_v2f16_reglo_vreg
730; GCN: s_waitcnt
731; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
732; GFX900-NEXT: s_waitcnt
733; GFX900-NEXT: global_store_dword
734; GFX900-NEXT: s_waitcnt
735; GFX900-NEXT: s_setpc_b64
736
737; GFX803: flat_load_ushort
738; GFX906: global_load_ushort
739define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
740entry:
741  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
742  %load = load half, half addrspace(4)* %gep
743  %build0 = insertelement <2 x half> undef, half %reg, i32 0
744  %build1 = insertelement <2 x half> %build0, half %load, i32 1
745  store <2 x half> %build1, <2 x half> addrspace(1)* undef
746  ret void
747}
748
749; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
750; GCN: s_waitcnt
751; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
752; GFX900-NEXT: s_waitcnt
753; GFX900-NEXT: global_store_dword
754; GFX900-NEXT: s_waitcnt
755; GFX900-NEXT: s_setpc_b64
756define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
757entry:
758  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
759  %load = load i8, i8 addrspace(4)* %gep
760  %ext = sext i8 %load to i16
761  %bitcast = bitcast i16 %ext to half
762  %build0 = insertelement <2 x half> undef, half %reg, i32 0
763  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
764  store <2 x half> %build1, <2 x half> addrspace(1)* undef
765  ret void
766}
767
768; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
769; GCN: s_waitcnt
770; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
771; GFX900-NEXT: s_waitcnt
772; GFX900-NEXT: global_store_dword
773; GFX900-NEXT: s_waitcnt
774; GFX900-NEXT: s_setpc_b64
775define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
776entry:
777  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
778  %load = load i8, i8 addrspace(4)* %gep
779  %ext = zext i8 %load to i16
780  %bitcast = bitcast i16 %ext to half
781  %build0 = insertelement <2 x half> undef, half %reg, i32 0
782  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
783  store <2 x half> %build1, <2 x half> addrspace(1)* undef
784  ret void
785}
786
787; Local object gives known offset, so requires converting from offen
788; to offset variant.
789
790; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
791; GFX900: buffer_store_dword
792; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
793define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
794entry:
795  %obj0 = alloca [10 x i32], align 4, addrspace(5)
796  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
797  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
798  store volatile i32 123, i32 addrspace(5)* %bc
799  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
800  %load = load i16, i16 addrspace(5)* %gep
801  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
802  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
803  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
804  ret void
805}
806
807; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
808; GFX900: buffer_store_dword
809; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
810define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
811entry:
812  %obj0 = alloca [10 x i32], align 4, addrspace(5)
813  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
814  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
815  store volatile i32 123, i32 addrspace(5)* %bc
816  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
817  %load = load i8, i8 addrspace(5)* %gep
818  %ext = sext i8 %load to i16
819  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
820  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
821  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
822  ret void
823}
824
825; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
826; GFX900: buffer_store_dword
827; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
828define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
829entry:
830  %obj0 = alloca [10 x i32], align 4, addrspace(5)
831  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
832  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
833  store volatile i32 123, i32 addrspace(5)* %bc
834  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
835  %load = load i8, i8 addrspace(5)* %gep
836  %ext = zext i8 %load to i16
837  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
838  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
839  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
840  ret void
841}
842
843; FIXME: Remove m0 init and waitcnt between reads
844; FIXME: Is there a cost to using the extload over not?
845; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
846; GCN: s_waitcnt
847; GFX900-NEXT: ds_read_u16 v1, v0
848; GFX900-NEXT: s_waitcnt
849; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
850; GFX900-NEXT: s_waitcnt
851; GFX900-NEXT: v_mov_b32_e32 v0, v1
852; GFX900-NEXT: s_setpc_b64
853define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
854entry:
855  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
856  %load0 = load volatile i16, i16 addrspace(3)* %in
857  %load1 = load volatile i16, i16 addrspace(3)* %gep
858  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
859  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
860  ret <2 x i16> %build1
861}
862
863; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
864; GFX900: ds_read_u16 v1, v0
865; GFX900-NEXT: s_waitcnt lgkmcnt(0)
866; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
867; GFX900-NEXT: s_waitcnt lgkmcnt(0)
868; GFX900-NEXT: v_mov_b32_e32 v0, v1
869; GFX900-NEXT: s_setpc_b64
870
871; NO-D16-HI: ds_read_u16
872; NO-D16-HI: ds_read_u16
873define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
874entry:
875  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
876  %load.lo = load i16, i16 addrspace(3)* %in
877  %load.hi = load i16, i16 addrspace(3)* %gep
878  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
879  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
880  ret <2 x i16> %build1
881}
882
883; FIXME: Remove and
884; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
885; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
886; GCN-NOT: ds_read
887; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
888; GFX9: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
889define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
890entry:
891  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
892  %load0 = load i16, i16 addrspace(3)* %in
893  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
894  %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
895  ret <2 x i16> %build1
896}
897
898; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
899; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
900; GFX900: ds_write_b16
901; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16
902
903; NO-D16-HI: ds_read_u16
904; NO-D16-HI: ds_write_b16
905; NO-D16-HI: ds_read_u16
906define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
907entry:
908  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
909  %load.lo = load i16, i16 addrspace(3)* %in
910  store i16 123, i16 addrspace(3)* %may.alias
911  %load.hi = load i16, i16 addrspace(3)* %gep
912  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
913  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
914  ret <2 x i16> %build1
915}
916
917; FIXME: Remove waitcnt between reads
918; GCN-LABEL: {{^}}load_global_v2i16_split:
919; GCN: s_waitcnt
920; GFX900-NEXT: global_load_ushort v2
921; GFX900-NEXT: s_waitcnt
922; GFX900-NEXT: global_load_short_d16_hi v2
923; GFX900-NEXT: s_waitcnt
924; GFX900-NEXT: v_mov_b32_e32 v0, v2
925; GFX900-NEXT: s_setpc_b64
926define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
927entry:
928  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
929  %load0 = load volatile i16, i16 addrspace(1)* %in
930  %load1 = load volatile i16, i16 addrspace(1)* %gep
931  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
932  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
933  ret <2 x i16> %build1
934}
935
936; FIXME: Remove waitcnt between reads
937; GCN-LABEL: {{^}}load_flat_v2i16_split:
938; GCN: s_waitcnt
939; GFX900-NEXT: flat_load_ushort v2
940; GFX900-NEXT: s_waitcnt
941; GFX900-NEXT: flat_load_short_d16_hi v2
942; GFX900-NEXT: s_waitcnt
943; GFX900-NEXT: v_mov_b32_e32 v0, v2
944; GFX900-NEXT: s_setpc_b64
945define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
946entry:
947  %gep = getelementptr inbounds i16, i16* %in, i64 1
948  %load0 = load volatile i16, i16* %in
949  %load1 = load volatile i16, i16* %gep
950  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
951  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
952  ret <2 x i16> %build1
953}
954
955; FIXME: Remove waitcnt between reads
956; GCN-LABEL: {{^}}load_constant_v2i16_split:
957; GCN: s_waitcnt
958; GFX900-NEXT: global_load_ushort v2
959; GFX900-NEXT: s_waitcnt
960; GFX900-NEXT: global_load_short_d16_hi v2
961; GFX900-NEXT: s_waitcnt
962; GFX900-NEXT: v_mov_b32_e32 v0, v2
963; GFX900-NEXT: s_setpc_b64
964define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
965entry:
966  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
967  %load0 = load volatile i16, i16 addrspace(4)* %in
968  %load1 = load volatile i16, i16 addrspace(4)* %gep
969  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
970  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
971  ret <2 x i16> %build1
972}
973
974; FIXME: Remove m0 init and waitcnt between reads
975; FIXME: Is there a cost to using the extload over not?
976; GCN-LABEL: {{^}}load_private_v2i16_split:
977; GCN: s_waitcnt
978; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}}
979; GFX900-NEXT: s_waitcnt
980; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
981; GFX900-NEXT: s_waitcnt
982; GFX900-NEXT: s_setpc_b64
983define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
984entry:
985  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
986  %load0 = load volatile i16, i16 addrspace(5)* %in
987  %load1 = load volatile i16, i16 addrspace(5)* %gep
988  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
989  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
990  ret <2 x i16> %build1
991}
992
993attributes #0 = { nounwind }
994