1; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
2; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
3; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
4
; Loads two i16 values from addrspace(3) and packs them into the returned
; <2 x i16>. The low element is additionally stored to the null addrspace(3)
; pointer, so %load.lo has a second use. The gfx900 checks expect a plain
; ds_read_u16 for the low half and ds_read_u16_d16_hi for the high half.
5; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo:
6; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7; GFX900-NEXT: ds_read_u16 v2, v0
8; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
9; GFX900-DAG: s_waitcnt lgkmcnt(0)
10; GFX900-DAG: v_mov_b32_e32 v1, v2
11; GFX900-DAG: ds_read_u16_d16_hi v1, v0 offset:16
12; GFX900: ds_write_b16 [[ZERO]], v2
13; GFX900-NEXT: s_waitcnt lgkmcnt(1)
14; GFX900-NEXT: v_mov_b32_e32 v0, v1
15; GFX900-NEXT: s_waitcnt lgkmcnt(0)
16; GFX900-NEXT: s_setpc_b64 s[30:31]
17define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
18entry:
19  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
20  %load.lo = load i16, i16 addrspace(3)* %in
21  %load.hi = load i16, i16 addrspace(3)* %gep
22  store i16 %load.lo, i16 addrspace(3)* null
23  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
24  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
25  ret <2 x i16> %build1
26}
27
; Same shape as the multi_use_lo case, but here the high element (%load.hi)
; is the one with the extra use (the store to null). The gfx900 checks expect
; two plain ds_read_u16 loads packed with v_and_b32 / v_lshl_or_b32 rather
; than a d16_hi load.
28; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi:
29; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0
31; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16
32; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
33; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]]
34; GFX900-DAG: s_waitcnt lgkmcnt(0)
35; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]]
36; GFX900: v_lshl_or_b32 [[HI]], [[HI]], 16, [[AND]]
37; GFX900-NEXT: s_waitcnt lgkmcnt(0)
38; GFX900-NEXT: s_setpc_b64 s[30:31]
39define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
40entry:
41  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
42  %load.lo = load i16, i16 addrspace(3)* %in
43  %load.hi = load i16, i16 addrspace(3)* %gep
44  store i16 %load.hi, i16 addrspace(3)* null
45  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
46  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
47  ret <2 x i16> %build1
48}
49
; Both loaded halves have an extra use: %load.lo is stored to %out0 and
; %load.hi to %out1 before being packed into the returned <2 x i16>. The
; gfx900 checks expect two plain ds_read_u16 loads, the two ds_write_b16
; stores, and a v_and_b32 / v_lshl_or_b32 pack.
50; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi:
51; GFX900: ds_read_u16 v3, v0
52; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
53; GFX900-NEXT: s_waitcnt lgkmcnt(1)
54; GFX900-NEXT: ds_write_b16 v1, v3
55; GFX900-NEXT: s_waitcnt lgkmcnt(1)
56; GFX900-NEXT: ds_write_b16 v2, v0
57; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3
58; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
59; GFX900-NEXT: s_waitcnt lgkmcnt(0)
60; GFX900-NEXT: s_setpc_b64 s[30:31]
61define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
62entry:
63  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
64  %load.lo = load i16, i16 addrspace(3)* %in
65  %load.hi = load i16, i16 addrspace(3)* %gep
66  store i16 %load.lo, i16 addrspace(3)* %out0
67  store i16 %load.hi, i16 addrspace(3)* %out1
68  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
69  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
70  ret <2 x i16> %build1
71}
72
; Loads an i16 from addrspace(3) and inserts it as element 1 of an otherwise
; undef <2 x i16>. gfx900 is expected to load straight into the high half of
; v0 with ds_read_u16_d16_hi; the gfx906 and fiji runs only check for a plain
; ds_read_u16.
73; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
74; GCN: s_waitcnt
75; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
76; GFX900-NEXT: s_waitcnt
77; GFX900-NEXT: s_setpc_b64
78
79; NO-D16-HI: ds_read_u16 v
80define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
81entry:
82  %load = load i16, i16 addrspace(3)* %in
83  %build = insertelement <2 x i16> undef, i16 %load, i32 1
84  ret <2 x i16> %build
85}
86
; The low element comes from the %reg argument and the high element from an
; addrspace(3) load; the packed vector is returned. gfx900 is expected to
; emit ds_read_u16_d16_hi into v1 and then copy v1 to v0 for the return
; value (v1 presumably holds %reg on entry -- confirm against the calling
; convention).
87; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
88; GCN: s_waitcnt
89; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
90; GFX900-NEXT: s_waitcnt
91; GFX900-NEXT: v_mov_b32_e32 v0, v1
92; GFX900-NEXT: s_setpc_b64
93
94; NO-D16-HI: ds_read_u16 v
95define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
96entry:
97  %load = load i16, i16 addrspace(3)* %in
98  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
99  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
100  ret <2 x i16> %build1
101}
102
103; Show that we get reasonable regalloc without physreg constraints.
; Unlike the reglo variant, the packed <2 x i16> is stored to an undef
; addrspace(1) pointer instead of being returned, so the result is not tied
; to the return register. gfx900 is expected to emit ds_read_u16_d16_hi into
; v1 and store v1 directly, with no extra copy.
104; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
105; GCN: s_waitcnt
106; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
107; GFX900-NEXT: s_waitcnt
108; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
109; GFX900-NEXT: s_waitcnt
110; GFX900-NEXT: s_setpc_b64
111
112; NO-D16-HI: ds_read_u16 v
113define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
114entry:
115  %load = load i16, i16 addrspace(3)* %in
116  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
117  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
118  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
119  ret void
120}
121
; Inserts the loaded i16 into element 1 of a zeroinitializer <2 x i16>.
; gfx900 is expected to materialize the zero low half with v_mov_b32 first
; and then load the high half on top of it with ds_read_u16_d16_hi.
122; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
123; GCN: s_waitcnt
124; GFX900-NEXT: v_mov_b32_e32 v1, 0
125; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
126; GFX900-NEXT: s_waitcnt
127; GFX900-NEXT: v_mov_b32_e32 v0, v1
128; GFX900-NEXT: s_setpc_b64
129
130; NO-D16-HI: ds_read_u16 v
131define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
132entry:
133  %load = load i16, i16 addrspace(3)* %in
134  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
135  ret <2 x i16> %build
136}
137
138; FIXME: Remove m0 initialization
; Scalar form of "value in the high 16 bits": the loaded i16 is zero-extended
; to i32 and shifted left by 16, with no vector build. Every run is expected
; to use a plain ds_read_u16 followed by v_lshlrev_b32 by 16; no d16_hi load
; is expected here, even on gfx900.
; NOTE(review): none of the check lines below mention m0, so it is unclear
; from this file which output the FIXME above refers to -- confirm.
139; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
140; GCN: s_waitcnt
141; GFX900-NEXT: ds_read_u16 v0, v0
142; GFX900-NEXT: s_waitcnt lgkmcnt(0)
143; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
144; GFX900-NEXT: s_setpc_b64
145
146; NO-D16-HI: ds_read_u16 v
147; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
148define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
149entry:
150  %load = load i16, i16 addrspace(3)* %in
151  %zext = zext i16 %load to i32
152  %shift = shl i32 %zext, 16
153  ret i32 %shift
154}
155
; Half-precision counterpart of load_local_hi_v2i16_reglo_vreg: a half loaded
; from addrspace(3) becomes element 1 of a <2 x half> whose element 0 is
; %reg, and the vector is stored to an undef addrspace(1) pointer. The
; gfx900 checks are the same as for the i16 version.
156; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
157; GCN: s_waitcnt
158; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
159; GFX900-NEXT: s_waitcnt
160; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
161; GFX900-NEXT: s_waitcnt
162; GFX900-NEXT: s_setpc_b64
163
164; NO-D16-HI: ds_read_u16 v
165define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
166entry:
167  %load = load half, half addrspace(3)* %in
168  %build0 = insertelement <2 x half> undef, half %reg, i32 0
169  %build1 = insertelement <2 x half> %build0, half %load, i32 1
170  store <2 x half> %build1, <2 x half> addrspace(1)* undef
171  ret void
172}
173
; An i8 loaded from addrspace(3) is zero-extended to i16 and inserted as the
; high element next to %reg. gfx900 is expected to fold the load and the
; zero-extension into a single ds_read_u8_d16_hi.
174; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
175; GCN: s_waitcnt
176; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
177; GFX900-NEXT: s_waitcnt
178; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
179; GFX900-NEXT: s_waitcnt
180; GFX900-NEXT: s_setpc_b64
181
182; NO-D16-HI: ds_read_u8 v
183define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
184entry:
185  %load = load i8, i8 addrspace(3)* %in
186  %ext = zext i8 %load to i16
187  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
188  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
189  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
190  ret void
191}
192
; An i8 loaded from addrspace(3) is sign-extended to i16 and inserted as the
; high element next to %reg. gfx900 is expected to fold the load and the
; sign-extension into a single ds_read_i8_d16_hi.
193; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
194; GCN: s_waitcnt
195; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
196; GFX900-NEXT: s_waitcnt
197; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
198; GFX900-NEXT: s_waitcnt
199; GFX900-NEXT: s_setpc_b64
200
201; NO-D16-HI: ds_read_i8 v
202define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
203entry:
204  %load = load i8, i8 addrspace(3)* %in
205  %ext = sext i8 %load to i16
206  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
207  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
208  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
209  ret void
210}
211
; As the v2i16 zexti8 case, but the zero-extended i16 is bitcast to half and
; inserted into a <2 x half>. The bitcast is not expected to block the fold:
; gfx900 should still emit ds_read_u8_d16_hi.
212; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8:
213; GCN: s_waitcnt
214; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
215; GFX900-NEXT: s_waitcnt
216; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
217; GFX900-NEXT: s_waitcnt
218; GFX900-NEXT: s_setpc_b64
219
220; NO-D16-HI: ds_read_u8 v
221define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
222entry:
223  %load = load i8, i8 addrspace(3)* %in
224  %ext = zext i8 %load to i16
225  %bitcast = bitcast i16 %ext to half
226
227  %build0 = insertelement <2 x half> undef, half %reg, i32 0
228  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
229  store <2 x half> %build1, <2 x half> addrspace(1)* undef
230  ret void
231}
232
; As the v2i16 sexti8 case, but the sign-extended i16 is bitcast to half and
; inserted into a <2 x half>. gfx900 should still emit ds_read_i8_d16_hi.
233; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8:
234; GCN: s_waitcnt
235; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
236; GFX900-NEXT: s_waitcnt
237; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
238; GFX900-NEXT: s_waitcnt
239; GFX900-NEXT: s_setpc_b64
240
241; NO-D16-HI: ds_read_i8 v
242define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
243entry:
244  %load = load i8, i8 addrspace(3)* %in
245  %ext = sext i8 %load to i16
246  %bitcast = bitcast i16 %ext to half
247
248  %build0 = insertelement <2 x half> undef, half %reg, i32 0
249  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
250  store <2 x half> %build1, <2 x half> addrspace(1)* undef
251  ret void
252}
253
; addrspace(1) version of the reglo_vreg pattern. The load address is
; %in - 2047 i16 elements (-4094 bytes); gfx900 is expected to fold that into
; the immediate offset of global_load_short_d16_hi.
254; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
255; GCN: s_waitcnt
256; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
257; GFX900-NEXT: s_waitcnt
258; GFX900-NEXT: global_store_dword
259; GFX900-NEXT: s_waitcnt
260; GFX900-NEXT: s_setpc_b64
261define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
262entry:
263  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
264  %load = load i16, i16 addrspace(1)* %gep
265  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
266  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
267  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
268  ret void
269}
270
; Half-precision counterpart of load_global_hi_v2i16_reglo_vreg: a half is
; loaded from %in - 2047 elements (-4094 bytes) in addrspace(1) and placed in
; element 1 of a <2 x half>. gfx900 is expected to emit
; global_load_short_d16_hi with the offset folded.
271; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
272; GCN: s_waitcnt
273; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
274; GFX900-NEXT: s_waitcnt
275; GFX900-NEXT: global_store_dword
276; GFX900-NEXT: s_waitcnt
277; GFX900-NEXT: s_setpc_b64
278define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
279entry:
280  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
281  %load = load half, half addrspace(1)* %gep
282  %build0 = insertelement <2 x half> undef, half %reg, i32 0
283  %build1 = insertelement <2 x half> %build0, half %load, i32 1
284  store <2 x half> %build1, <2 x half> addrspace(1)* undef
285  ret void
286}
287
; An i8 loaded from %in - 4095 bytes in addrspace(1) is zero-extended to i16
; and used as the high element. gfx900 is expected to emit
; global_load_ubyte_d16_hi with offset:-4095.
288; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
289; GCN: s_waitcnt
290; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
291; GFX900-NEXT: s_waitcnt
292; GFX900-NEXT: global_store_dword
293; GFX900-NEXT: s_waitcnt
294; GFX900-NEXT: s_setpc_b64
295define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
296entry:
297  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
298  %load = load i8, i8 addrspace(1)* %gep
299  %ext = zext i8 %load to i16
300  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
301  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
302  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
303  ret void
304}
305
; An i8 loaded from %in - 4095 bytes in addrspace(1) is sign-extended to i16
; and used as the high element. gfx900 is expected to emit
; global_load_sbyte_d16_hi with offset:-4095.
306; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
307; GCN: s_waitcnt
308; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
309; GFX900-NEXT: s_waitcnt
310; GFX900-NEXT: global_store_dword
311; GFX900-NEXT: s_waitcnt
312; GFX900-NEXT: s_setpc_b64
313define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
314entry:
315  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
316  %load = load i8, i8 addrspace(1)* %gep
317  %ext = sext i8 %load to i16
318  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
319  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
320  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
321  ret void
322}
323
; addrspace(1) sign-extending byte load whose i16 result is bitcast to half
; before being inserted into a <2 x half>. gfx900 is expected to emit
; global_load_sbyte_d16_hi with offset:-4095 despite the bitcast.
324; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8:
325; GCN: s_waitcnt
326; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
327; GFX900-NEXT: s_waitcnt
328; GFX900-NEXT: global_store_dword
329; GFX900-NEXT: s_waitcnt
330; GFX900-NEXT: s_setpc_b64
331define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
332entry:
333  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
334  %load = load i8, i8 addrspace(1)* %gep
335  %ext = sext i8 %load to i16
336  %bitcast = bitcast i16 %ext to half
337  %build0 = insertelement <2 x half> undef, half %reg, i32 0
338  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
339  store <2 x half> %build1, <2 x half> addrspace(1)* undef
340  ret void
341}
342
; addrspace(1) zero-extending byte load whose i16 result is bitcast to half
; before being inserted into a <2 x half>. gfx900 is expected to emit
; global_load_ubyte_d16_hi with offset:-4095 despite the bitcast.
343; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8:
344; GCN: s_waitcnt
345; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
346; GFX900-NEXT: s_waitcnt
347; GFX900-NEXT: global_store_dword
348; GFX900-NEXT: s_waitcnt
349; GFX900-NEXT: s_setpc_b64
350define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
351entry:
352  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
353  %load = load i8, i8 addrspace(1)* %gep
354  %ext = zext i8 %load to i16
355  %bitcast = bitcast i16 %ext to half
356  %build0 = insertelement <2 x half> undef, half %reg, i32 0
357  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
358  store <2 x half> %build1, <2 x half> addrspace(1)* undef
359  ret void
360}
361
; Generic-pointer (no addrspace qualifier) version of the reglo_vreg pattern:
; the loaded i16 becomes element 1 of a <2 x i16> whose element 0 is %reg.
; gfx900 is expected to emit flat_load_short_d16_hi. The gfx906 and fiji runs
; expect a plain flat_load_ushort followed by an explicit pack (shift plus
; v_or_b32_sdwa on fiji, v_lshl_or_b32 on gfx906).
; The label pattern is anchored with {{^}} like every other label in this
; file so it can only match the function's own label line.
362; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
363; GCN: s_waitcnt
364; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
365; GFX900-NEXT: s_waitcnt
366; GFX900-NEXT: global_store_dword v[0:1], v2
367; GFX900-NEXT: s_waitcnt
368; GFX900-NEXT: s_setpc_b64
369
370; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
371; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
372; GFX803: v_or_b32_sdwa
373; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
374define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
375entry:
376  %load = load i16, i16* %in
377  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
378  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
379  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
380  ret void
381}
382
; Half-precision counterpart of load_flat_hi_v2i16_reglo_vreg, loading
; through a generic pointer. gfx900 is expected to emit
; flat_load_short_d16_hi; the gfx906 and fiji runs expect flat_load_ushort
; followed by an explicit pack.
383; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
384; GCN: s_waitcnt
385; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
386; GFX900-NEXT: s_waitcnt
387; GFX900-NEXT: global_store_dword v[0:1], v2
388; GFX900-NEXT: s_waitcnt
389; GFX900-NEXT: s_setpc_b64
390
391; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
392; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
393; GFX803: v_or_b32_sdwa
394; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
395define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
396entry:
397  %load = load half, half* %in
398  %build0 = insertelement <2 x half> undef, half %reg, i32 0
399  %build1 = insertelement <2 x half> %build0, half %load, i32 1
400  store <2 x half> %build1, <2 x half> addrspace(1)* undef
401  ret void
402}
403
; Generic-pointer i8 load, zero-extended to i16 and used as the high element.
; gfx900 is expected to emit flat_load_ubyte_d16_hi; the gfx906 and fiji runs
; expect flat_load_ubyte followed by an explicit pack.
404; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
405; GCN: s_waitcnt
406; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
407; GFX900-NEXT: s_waitcnt
408; GFX900-NEXT: global_store_dword v[0:1], v2
409; GFX900-NEXT: s_waitcnt
410; GFX900-NEXT: s_setpc_b64
411
412; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
413; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
414; GFX803: v_or_b32_sdwa
415; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
416define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
417entry:
418  %load = load i8, i8* %in
419  %ext = zext i8 %load to i16
420  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
421  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
422  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
423  ret void
424}
425
; Generic-pointer i8 load, sign-extended to i16 and used as the high element.
; gfx900 is expected to emit flat_load_sbyte_d16_hi; the gfx906 and fiji runs
; expect flat_load_sbyte followed by an explicit pack.
426; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
427; GCN: s_waitcnt
428; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
429; GFX900-NEXT: s_waitcnt
430; GFX900-NEXT: global_store_dword v[0:1], v2
431; GFX900-NEXT: s_waitcnt
432; GFX900-NEXT: s_setpc_b64
433
434; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
435; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
436; GFX803: v_or_b32_sdwa
437; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
438define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
439entry:
440  %load = load i8, i8* %in
441  %ext = sext i8 %load to i16
442  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
443  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
444  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
445  ret void
446}
447
; Generic-pointer i8 load, zero-extended to i16 and bitcast to half before
; insertion into a <2 x half>. gfx900 is expected to emit
; flat_load_ubyte_d16_hi despite the bitcast.
448; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8:
449; GCN: s_waitcnt
450; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
451; GFX900-NEXT: s_waitcnt
452; GFX900-NEXT: global_store_dword v[0:1], v2
453; GFX900-NEXT: s_waitcnt
454; GFX900-NEXT: s_setpc_b64
455
456; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
457; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
458; GFX803: v_or_b32_sdwa
459; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
460define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
461entry:
462  %load = load i8, i8* %in
463  %ext = zext i8 %load to i16
464  %bitcast = bitcast i16 %ext to half
465  %build0 = insertelement <2 x half> undef, half %reg, i32 0
466  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
467  store <2 x half> %build1, <2 x half> addrspace(1)* undef
468  ret void
469}
470
; Generic-pointer i8 load, sign-extended to i16 and bitcast to half before
; insertion into a <2 x half>. gfx900 is expected to emit
; flat_load_sbyte_d16_hi despite the bitcast.
471; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8:
472; GCN: s_waitcnt
473; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
474; GFX900-NEXT: s_waitcnt
475; GFX900-NEXT: global_store_dword v[0:1], v2
476; GFX900-NEXT: s_waitcnt
477; GFX900-NEXT: s_setpc_b64
478
479; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
480; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
481; GFX803: v_or_b32_sdwa
482; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
483define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
484entry:
485  %load = load i8, i8* %in
486  %ext = sext i8 %load to i16
487  %bitcast = bitcast i16 %ext to half
488  %build0 = insertelement <2 x half> undef, half %reg, i32 0
489  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
490  store <2 x half> %build1, <2 x half> addrspace(1)* undef
491  ret void
492}
493
; addrspace(5) version of the reglo_vreg pattern using a byval pointer
; argument. The load is at element 2047 (byte offset 4094) from %in; gfx900
; is expected to emit buffer_load_short_d16_hi addressed off s32 with
; offset:4094, and the gfx906 and fiji runs a buffer_load_ushort with the
; same addressing.
494; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
495; GCN: s_waitcnt
496; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
497; GFX900-NEXT: s_waitcnt
498; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
499; GFX900-NEXT: s_waitcnt
500; GFX900-NEXT: s_setpc_b64
501
502; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
503define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
504entry:
505  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
506  %load = load i16, i16 addrspace(5)* %gep
507  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
508  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
509  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
510  ret void
511}
512
; Half-precision counterpart of load_private_hi_v2i16_reglo_vreg: a half is
; loaded from element 2047 (byte offset 4094) of the byval addrspace(5)
; argument. The expected instructions match the i16 version.
513; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
514; GCN: s_waitcnt
515; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
516; GFX900-NEXT: s_waitcnt
517; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
518; GFX900-NEXT: s_waitcnt
519; GFX900-NEXT: s_setpc_b64
520
521; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
522define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
523entry:
524  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
525  %load = load half, half addrspace(5)* %gep
526  %build0 = insertelement <2 x half> undef, half %reg, i32 0
527  %build1 = insertelement <2 x half> %build0, half %load, i32 1
528  store <2 x half> %build1, <2 x half> addrspace(1)* undef
529  ret void
530}
531
; Volatile i16 load from the constant addrspace(5) address 4094 (built with
; inttoptr); the %in argument itself is not used by the body. The expected
; buffer load has 0 in the position where the other private tests have s32,
; with the address carried entirely by offset:4094.
532; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
533; GCN: s_waitcnt
534; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
535; GFX900: s_waitcnt
536; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
537; GFX900-NEXT: s_waitcnt
538; GFX900-NEXT: s_setpc_b64
539
540; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
541define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
542entry:
543  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
544  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
545  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
546  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
547  ret void
548}
549
; Volatile half load from the constant addrspace(5) address 4094 (built with
; inttoptr); %in is a plain (non-byval) pointer argument that the body does
; not use. gfx900 is expected to emit buffer_load_short_d16_hi into v1 with
; 0 in the scalar-offset position and offset:4094.
550; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
551; GCN: s_waitcnt
552; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
553; GFX900-NEXT: s_waitcnt
554; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
555; GFX900-NEXT: s_waitcnt
556; GFX900-NEXT: s_setpc_b64
557
558; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
559define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
560entry:
561  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
562  %build0 = insertelement <2 x half> undef, half %reg, i32 0
563  %build1 = insertelement <2 x half> %build0, half %load, i32 1
564  store <2 x half> %build1, <2 x half> addrspace(1)* undef
565  ret void
566}
567
; i8 load from byte 4095 of the byval addrspace(5) argument, zero-extended to
; i16 and used as the high element. gfx900 is expected to emit
; buffer_load_ubyte_d16_hi addressed off s32 with offset:4095.
568; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
569; GCN: s_waitcnt
570; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
571; GFX900-NEXT: s_waitcnt
572; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
573; GFX900-NEXT: s_waitcnt
574; GFX900-NEXT: s_setpc_b64
575
576; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
577define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
578entry:
579  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
580  %load = load i8, i8 addrspace(5)* %gep
581  %ext = zext i8 %load to i16
582  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
583  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
584  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
585  ret void
586}
587
; As load_private_hi_v2i16_reglo_vreg_zexti8, but the zero-extended i16 is
; bitcast to half and inserted into a <2 x half>. gfx900 is still expected to
; emit buffer_load_ubyte_d16_hi with offset:4095.
588; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8:
589; GCN: s_waitcnt
590; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
591; GFX900-NEXT: s_waitcnt
592; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
593; GFX900-NEXT: s_waitcnt
594; GFX900-NEXT: s_setpc_b64
595
596; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
597define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
598entry:
599  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
600  %load = load i8, i8 addrspace(5)* %gep
601  %ext = zext i8 %load to i16
602  %bitcast = bitcast i16 %ext to half
603  %build0 = insertelement <2 x half> undef, half %reg, i32 0
604  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
605  store <2 x half> %build1, <2 x half> addrspace(1)* undef
606  ret void
607}
608
; i8 load from byte 4095 of the byval addrspace(5) argument, sign-extended to
; i16 and bitcast to half before insertion into a <2 x half>. gfx900 is
; expected to emit buffer_load_sbyte_d16_hi with offset:4095.
609; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8:
610; GCN: s_waitcnt
611; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
612; GFX900-NEXT: s_waitcnt
613; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
614; GFX900-NEXT: s_waitcnt
615; GFX900-NEXT: s_setpc_b64
616
617; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
618define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
619entry:
620  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
621  %load = load i8, i8 addrspace(5)* %gep
622  %ext = sext i8 %load to i16
623  %bitcast = bitcast i16 %ext to half
624  %build0 = insertelement <2 x half> undef, half %reg, i32 0
625  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
626  store <2 x half> %build1, <2 x half> addrspace(1)* undef
627  ret void
628}
629
; i8 load from byte 4095 of the byval addrspace(5) argument, sign-extended to
; i16 and used as the high element of a <2 x i16>. gfx900 is expected to emit
; buffer_load_sbyte_d16_hi with offset:4095.
630; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
631; GCN: s_waitcnt
632; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
633; GFX900-NEXT: s_waitcnt
634; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
635; GFX900-NEXT: s_waitcnt
636; GFX900-NEXT: s_setpc_b64
637
638; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
639define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
640entry:
641  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
642  %load = load i8, i8 addrspace(5)* %gep
643  %ext = sext i8 %load to i16
644  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
645  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
646  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
647  ret void
648}
649
; Volatile i8 load from the constant addrspace(5) address 4094, zero-extended
; to i16 and used as the high element; %in is unused by the body. gfx900 is
; expected to emit buffer_load_ubyte_d16_hi into v1 with 0 in the
; scalar-offset position and offset:4094.
650; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
651; GCN: s_waitcnt
652; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
653; GFX900-NEXT: s_waitcnt
654; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
655; GFX900-NEXT: s_waitcnt
656; GFX900-NEXT: s_setpc_b64
657
658; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
659define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
660entry:
661  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
662  %ext = zext i8 %load to i16
663  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
664  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
665  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
666  ret void
667}
668
; Volatile i8 load from the constant addrspace(5) address 4094, sign-extended
; to i16 and used as the high element; %in is unused by the body. gfx900 is
; expected to emit buffer_load_sbyte_d16_hi into v1 with 0 in the
; scalar-offset position and offset:4094.
669; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
670; GCN: s_waitcnt
671; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
672; GFX900-NEXT: s_waitcnt
673; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
674; GFX900-NEXT: s_waitcnt
675; GFX900-NEXT: s_setpc_b64
676
677; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
678define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
679entry:
680  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
681  %ext = sext i8 %load to i16
682  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
683  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
684  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
685  ret void
686}
687
; Volatile i8 load from the constant addrspace(5) address 4094, zero-extended
; to i16 and bitcast to half before insertion into a <2 x half>; %in is
; unused by the body. gfx900 is expected to emit buffer_load_ubyte_d16_hi
; into v1 with 0 in the scalar-offset position and offset:4094.
688; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
689; GCN: s_waitcnt
690; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
691; GFX900-NEXT: s_waitcnt
692; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
693; GFX900-NEXT: s_waitcnt
694; GFX900-NEXT: s_setpc_b64
695
696; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
697define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
698entry:
699  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
700  %ext = zext i8 %load to i16
701  %bc.ext = bitcast i16 %ext to half
702  %build0 = insertelement <2 x half> undef, half %reg, i32 0
703  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
704  store <2 x half> %build1, <2 x half> addrspace(1)* undef
705  ret void
706}
707
; addrspace(4) version of the reglo_vreg pattern, loading an i16 from
; %in - 2047 elements (-4094 bytes). gfx900 is expected to emit
; global_load_short_d16_hi with the offset folded; fiji is expected to use
; flat_load_ushort and gfx906 global_load_ushort.
708; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
709; GCN: s_waitcnt
710; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
711; GFX900-NEXT: s_waitcnt
712; GFX900-NEXT: global_store_dword
713; GFX900-NEXT: s_waitcnt
714; GFX900-NEXT: s_setpc_b64
715
716; GFX803: flat_load_ushort
717; GFX906: global_load_ushort
718define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
719entry:
720  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
721  %load = load i16, i16 addrspace(4)* %gep
722  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
723  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
724  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
725  ret void
726}
727
; Half-precision counterpart of load_constant_hi_v2i16_reglo_vreg: a half is
; loaded from %in - 2047 elements (-4094 bytes) in addrspace(4) and placed in
; element 1 of a <2 x half>. gfx900 is expected to emit
; global_load_short_d16_hi; fiji is expected to use flat_load_ushort and
; gfx906 global_load_ushort.
; The label pattern is anchored with {{^}} and terminated with ':' like the
; other labels in this file; without them it is also a prefix of the
; _sexti8 / _zexti8 function names that follow.
728; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
729; GCN: s_waitcnt
730; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
731; GFX900-NEXT: s_waitcnt
732; GFX900-NEXT: global_store_dword
733; GFX900-NEXT: s_waitcnt
734; GFX900-NEXT: s_setpc_b64
735
736; GFX803: flat_load_ushort
737; GFX906: global_load_ushort
738define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
739entry:
740  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
741  %load = load half, half addrspace(4)* %gep
742  %build0 = insertelement <2 x half> undef, half %reg, i32 0
743  %build1 = insertelement <2 x half> %build0, half %load, i32 1
744  store <2 x half> %build1, <2 x half> addrspace(1)* undef
745  ret void
746}
747
; i8 load from %in - 4095 bytes in addrspace(4), sign-extended to i16 and
; bitcast to half before insertion into a <2 x half>. gfx900 is expected to
; emit global_load_sbyte_d16_hi with offset:-4095. Only gfx900 instruction
; checks are present for this function.
748; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
749; GCN: s_waitcnt
750; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
751; GFX900-NEXT: s_waitcnt
752; GFX900-NEXT: global_store_dword
753; GFX900-NEXT: s_waitcnt
754; GFX900-NEXT: s_setpc_b64
755define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
756entry:
757  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
758  %load = load i8, i8 addrspace(4)* %gep
759  %ext = sext i8 %load to i16
760  %bitcast = bitcast i16 %ext to half
761  %build0 = insertelement <2 x half> undef, half %reg, i32 0
762  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
763  store <2 x half> %build1, <2 x half> addrspace(1)* undef
764  ret void
765}
766
; i8 load from %in - 4095 bytes in addrspace(4), zero-extended to i16 and
; bitcast to half before insertion into a <2 x half>. gfx900 is expected to
; emit global_load_ubyte_d16_hi with offset:-4095. Only gfx900 instruction
; checks are present for this function.
767; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
768; GCN: s_waitcnt
769; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
770; GFX900-NEXT: s_waitcnt
771; GFX900-NEXT: global_store_dword
772; GFX900-NEXT: s_waitcnt
773; GFX900-NEXT: s_setpc_b64
774define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
775entry:
776  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
777  %load = load i8, i8 addrspace(4)* %gep
778  %ext = zext i8 %load to i16
779  %bitcast = bitcast i16 %ext to half
780  %build0 = insertelement <2 x half> undef, half %reg, i32 0
781  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
782  store <2 x half> %build1, <2 x half> addrspace(1)* undef
783  ret void
784}
785
786; Local object gives known offset, so requires converting from offen
787; to offset variant.
788
; Two allocas are created: a 40-byte [10 x i32] that receives a volatile
; store, and a [4096 x i16] from which element 2027 (byte 4054 of the array)
; is loaded into the high half. The expected offset:4094 equals 40 + 4054,
; i.e. it is consistent with %obj1 being laid out directly after %obj0 --
; presumably; confirm against the frame layout.
789; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
790; GFX900: buffer_store_dword
791; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
792define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
793entry:
794  %obj0 = alloca [10 x i32], align 4, addrspace(5)
795  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
796  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
797  store volatile i32 123, i32 addrspace(5)* %bc
798  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
799  %load = load i16, i16 addrspace(5)* %gep
800  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
801  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
802  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
803  ret void
804}
805
; Byte-sized, sign-extending variant of the known-offset private load: byte
; 4055 of a [4096 x i8] addrspace(5) alloca is loaded, sign-extended to i16,
; and inserted as element 1 of a <2 x i16> whose element 0 is %reg. The gfx900
; checks expect buffer_load_sbyte_d16_hi with an immediate offset of 4095.
; NOTE(review): 4095 matches 40 bytes for %obj0 plus 4055, which assumes
; %obj1 is laid out directly after %obj0 -- confirm against frame lowering.
806; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
807; GFX900: buffer_store_dword
808; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
809define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
810entry:
811  %obj0 = alloca [10 x i32], align 4, addrspace(5)
812  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
813  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
814  store volatile i32 123, i32 addrspace(5)* %bc
815  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
816  %load = load i8, i8 addrspace(5)* %gep
817  %ext = sext i8 %load to i16
818  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
819  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
820  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
821  ret void
822}
823
; Byte-sized, zero-extending variant of the known-offset private load: byte
; 4055 of a [4096 x i8] addrspace(5) alloca is loaded, zero-extended to i16,
; and inserted as element 1 of a <2 x i16> whose element 0 is %reg. The gfx900
; checks expect buffer_load_ubyte_d16_hi with an immediate offset of 4095.
; NOTE(review): 4095 matches 40 bytes for %obj0 plus 4055, which assumes
; %obj1 is laid out directly after %obj0 -- confirm against frame lowering.
824; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
825; GFX900: buffer_store_dword
826; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
827define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
828entry:
829  %obj0 = alloca [10 x i32], align 4, addrspace(5)
830  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
831  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
832  store volatile i32 123, i32 addrspace(5)* %bc
833  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
834  %load = load i8, i8 addrspace(5)* %gep
835  %ext = zext i8 %load to i16
836  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
837  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
838  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
839  ret void
840}
841
; Two volatile i16 loads from adjacent addrspace(3) elements (%in and %in + 1)
; are assembled into a <2 x i16> return value. The gfx900 checks expect a
; ds_read_u16 into v1 followed by a ds_read_u16_d16_hi at offset:2 into the
; same register, with a waitcnt between the two reads.
842; FIXME: Remove m0 init and waitcnt between reads
843; FIXME: Is there a cost to using the extload over not?
844; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
845; GCN: s_waitcnt
846; GFX900-NEXT: ds_read_u16 v1, v0
847; GFX900-NEXT: s_waitcnt
848; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
849; GFX900-NEXT: s_waitcnt
850; GFX900-NEXT: v_mov_b32_e32 v0, v1
851; GFX900-NEXT: s_setpc_b64
852define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
853entry:
854  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
855  %load0 = load volatile i16, i16 addrspace(3)* %in
856  %load1 = load volatile i16, i16 addrspace(3)* %gep
857  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
858  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
859  ret <2 x i16> %build1
860}
861
; Two non-volatile i16 loads from addrspace(3), at %in and at %in + 8 elements,
; fill elements 0 and 1 of the returned <2 x i16>. The gfx900 checks expect
; the second load to be a ds_read_u16_d16_hi at offset:16 into the register
; already holding the first load. Runs using the no-d16-hi check prefix
; (gfx906 and fiji in the RUN lines) only expect two plain ds_read_u16.
862; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
863; GFX900: ds_read_u16 v1, v0
864; GFX900-NEXT: s_waitcnt lgkmcnt(0)
865; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
866; GFX900-NEXT: s_waitcnt lgkmcnt(0)
867; GFX900-NEXT: v_mov_b32_e32 v0, v1
868; GFX900-NEXT: s_setpc_b64
869
870; NO-D16-HI: ds_read_u16
871; NO-D16-HI: ds_read_u16
872define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
873entry:
874  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
875  %load.lo = load i16, i16 addrspace(3)* %in
876  %load.hi = load i16, i16 addrspace(3)* %gep
877  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
878  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
879  ret <2 x i16> %build1
880}
881
; A single i16 loaded from addrspace(3) is inserted into both elements of the
; returned <2 x i16>. All targets must issue exactly one ds_read_u16; gfx900
; is additionally expected to build the vector with an and + lshl_or pair.
; The and/lshl_or checks use the gfx900 prefix because the gfx9 prefix they
; were originally written with is not enabled by any RUN line, which left
; them silently unchecked.
882; FIXME: Remove and
883; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
884; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
885; GCN-NOT: ds_read
886; GFX900: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
887; GFX900: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
888define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
889entry:
891  %load0 = load i16, i16 addrspace(3)* %in
892  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
893  %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
894  ret <2 x i16> %build1
895}
896
; The low-half load from %in and the high-half load from %in + 8 elements are
; separated by a store of 123 through %may.alias, a second addrspace(3)
; pointer that carries no noalias attribute. The checks expect the
; ds_write_b16 to stay between the two reads; on gfx900 the second read is
; still expected to be a ds_read_u16_d16_hi into the first read's register.
897; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
898; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
899; GFX900: ds_write_b16
900; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16
901
902; NO-D16-HI: ds_read_u16
903; NO-D16-HI: ds_write_b16
904; NO-D16-HI: ds_read_u16
905define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
906entry:
907  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
908  %load.lo = load i16, i16 addrspace(3)* %in
909  store i16 123, i16 addrspace(3)* %may.alias
910  %load.hi = load i16, i16 addrspace(3)* %gep
911  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
912  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
913  ret <2 x i16> %build1
914}
915
; Two volatile i16 loads from adjacent addrspace(1) elements (%in and %in + 1)
; are assembled into a <2 x i16> return value. The gfx900 checks expect a
; global_load_ushort into v2 followed by a global_load_short_d16_hi into the
; same register, then a copy to v0 for the return.
916; FIXME: Remove waitcnt between reads
917; GCN-LABEL: {{^}}load_global_v2i16_split:
918; GCN: s_waitcnt
919; GFX900-NEXT: global_load_ushort v2
920; GFX900-NEXT: s_waitcnt
921; GFX900-NEXT: global_load_short_d16_hi v2
922; GFX900-NEXT: s_waitcnt
923; GFX900-NEXT: v_mov_b32_e32 v0, v2
924; GFX900-NEXT: s_setpc_b64
925define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
926entry:
927  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
928  %load0 = load volatile i16, i16 addrspace(1)* %in
929  %load1 = load volatile i16, i16 addrspace(1)* %gep
930  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
931  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
932  ret <2 x i16> %build1
933}
934
; Same split pattern through a pointer with no addrspace qualifier: two
; volatile i16 loads from %in and %in + 1 form the returned <2 x i16>. The
; gfx900 checks expect flat_load_ushort into v2 followed by
; flat_load_short_d16_hi into the same register, then a copy to v0.
935; FIXME: Remove waitcnt between reads
936; GCN-LABEL: {{^}}load_flat_v2i16_split:
937; GCN: s_waitcnt
938; GFX900-NEXT: flat_load_ushort v2
939; GFX900-NEXT: s_waitcnt
940; GFX900-NEXT: flat_load_short_d16_hi v2
941; GFX900-NEXT: s_waitcnt
942; GFX900-NEXT: v_mov_b32_e32 v0, v2
943; GFX900-NEXT: s_setpc_b64
944define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
945entry:
946  %gep = getelementptr inbounds i16, i16* %in, i64 1
947  %load0 = load volatile i16, i16* %in
948  %load1 = load volatile i16, i16* %gep
949  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
950  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
951  ret <2 x i16> %build1
952}
953
; Same split pattern through an addrspace(4) pointer: two volatile i16 loads
; from %in and %in + 1 form the returned <2 x i16>. The gfx900 checks expect
; the same global_load_ushort / global_load_short_d16_hi pair into v2 that the
; addrspace(1) variant of this test expects.
954; FIXME: Remove waitcnt between reads
955; GCN-LABEL: {{^}}load_constant_v2i16_split:
956; GCN: s_waitcnt
957; GFX900-NEXT: global_load_ushort v2
958; GFX900-NEXT: s_waitcnt
959; GFX900-NEXT: global_load_short_d16_hi v2
960; GFX900-NEXT: s_waitcnt
961; GFX900-NEXT: v_mov_b32_e32 v0, v2
962; GFX900-NEXT: s_setpc_b64
963define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
964entry:
965  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
966  %load0 = load volatile i16, i16 addrspace(4)* %in
967  %load1 = load volatile i16, i16 addrspace(4)* %gep
968  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
969  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
970  ret <2 x i16> %build1
971}
972
; Same split pattern through a byval addrspace(5) pointer: two volatile i16
; loads from %in and %in + 1 form the returned <2 x i16>. The gfx900 checks
; expect buffer_load_ushort into v0 with no immediate offset, followed by
; buffer_load_short_d16_hi into v0 at offset:2, so no copy is needed before
; the return.
; NOTE(review): the "m0 init" part of the FIXME below looks copied from the
; addrspace(3) variant of this test; none of the checked instructions here
; reference m0 -- confirm whether it still applies.
973; FIXME: Remove m0 init and waitcnt between reads
974; FIXME: Is there a cost to using the extload over not?
975; GCN-LABEL: {{^}}load_private_v2i16_split:
976; GCN: s_waitcnt
977; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}}
978; GFX900-NEXT: s_waitcnt
979; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
980; GFX900-NEXT: s_waitcnt
981; GFX900-NEXT: s_setpc_b64
982define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
983entry:
984  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
985  %load0 = load volatile i16, i16 addrspace(5)* %in
986  %load1 = load volatile i16, i16 addrspace(5)* %gep
987  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
988  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
989  ret <2 x i16> %build1
990}
991
; An i16 loaded from addrspace(3) becomes element 1 of the returned <2 x i16>
; whose element 0 is %reg, and %reg is then stored (volatile) back through the
; same pointer. The gfx900 checks currently expect v0 to be copied before the
; d16_hi load so that the ds_write_b16 can still read the original v0.
; The label directive below anchors these checks to this function; without it
; they were matched in the tail of the preceding function's check block.
992; FIXME: This test should work without copying of v0.
993;        ds_read_u16_d16_hi preserves low 16 bits of the destination
994;        and ds_write_b16 only reads low 16 bits.
; GCN-LABEL: {{^}}load_local_hi_v2i16_store_local_lo:
995; GCN: s_waitcnt
996; GFX900:      v_mov_b32_e32 [[COPY:v[0-9]+]], v0
997; GFX900-NEXT: ds_read_u16_d16_hi [[COPY]], v1
998; GFX900-NEXT: ds_write_b16 v1, v0
999; GFX900-NEXT: s_waitcnt
1000; GFX900-NEXT: v_mov_b32_e32 v0, [[COPY]]
1001; GFX900-NEXT: s_waitcnt
1002; GFX900-NEXT: s_setpc_b64
1003define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* %in) #0 {
1004entry:
1005  %load = load i16, i16 addrspace(3)* %in
1006  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
1007  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
1008  store volatile i16 %reg, i16 addrspace(3)* %in
1009  ret <2 x i16> %build1
1010}
1011
; Attribute group #0, referenced by the function definitions above; it marks
; them nounwind.
1012attributes #0 = { nounwind }
1013