1; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
2; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
3; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
4
; Low element is loaded from local memory (addrspace 3); the high element is
; left undef.
; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16 v0, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
entry:
  %lo = load i16, i16 addrspace(3)* %in
  %vec = insertelement <2 x i16> undef, i16 %lo, i32 0
  ret <2 x i16> %vec
}
18
; High element comes from the i16 argument, low element from a local
; (addrspace 3) load; the packed vector is returned.
; NOTE(review): the GFX9-prefixed lines below are not enabled by any RUN line
; at the top of this file, so FileCheck ignores them - confirm whether an
; enabled prefix was intended.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo:
; GCN: s_waitcnt
; GCN: ds_read_u16 v0, v0
; GFX9: v_and_b32_e32 v0, 0xffff, v0
; GFX9: v_lshl_or_b32 v0, v1, 16, v0
; GFX9: s_setpc_b64
define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %lo = load i16, i16 addrspace(3)* %in
  %hi.vec = insertelement <2 x i16> undef, i16 %reg, i32 1
  %vec = insertelement <2 x i16> %hi.vec, i16 %lo, i32 0
  ret <2 x i16> %vec
}
32
; Show that we get reasonable regalloc without physreg constraints: same
; pattern as the returning variant, but the packed result is stored to an
; undef global pointer instead of being returned.
; NOTE(review): the GFX9-prefixed lines below are not enabled by any RUN line
; at the top of this file, so FileCheck ignores them - confirm whether an
; enabled prefix was intended.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GCN: ds_read_u16 v0, v0
; GCN: s_waitcnt
; GFX9: v_and_b32_e32 v0, 0xffff, v0
; GFX9: v_lshl_or_b32 v0, v1, 16, v0
; GFX9: global_store_dword v
define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %lo = load i16, i16 addrspace(3)* %in
  %hi.vec = insertelement <2 x i16> undef, i16 %reg, i32 1
  %vec = insertelement <2 x i16> %hi.vec, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
49
; Low element is loaded from local memory and inserted into an all-zero
; <2 x i16>, so the high element is the constant 0.
; GCN-LABEL: {{^}}load_local_lo_v2i16_zerolo:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
entry:
  %lo = load i16, i16 addrspace(3)* %in
  %vec = insertelement <2 x i16> zeroinitializer, i16 %lo, i32 0
  ret <2 x i16> %vec
}
65
; Low half is loaded from local memory and inserted into the constant vector
; <0.0, 2.0>, so the high half is the FP immediate 2.0.
; GCN-LABEL: {{^}}load_local_lo_v2f16_fpimm:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 2.0
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
entry:
  %lo = load half, half addrspace(3)* %in
  %vec = insertelement <2 x half> <half 0.0, half 2.0>, half %lo, i32 0
  ret <2 x half> %vec
}
81
; The i32 argument is reinterpreted as <2 x half>; its low half is replaced by
; a half loaded from local memory and the result is stored to global memory.
; GCN-LABEL: {{^}}load_local_lo_v2f16_reghi_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %lo = load half, half addrspace(3)* %in
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
; High half comes from the scalar half argument, low half from a local load;
; the packed vector is stored to global memory.
; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg:
; GFX900: ds_read_u16 v
; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
; GFX900: global_store_dword

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
entry:
  %lo = load half, half addrspace(3)* %in
  %hi.vec = insertelement <2 x half> undef, half %reg, i32 1
  %vec = insertelement <2 x half> %hi.vec, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
115
; An i8 loaded from local memory is zero-extended to i16 and written into the
; low element of the i32 argument viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %byte = load i8, i8 addrspace(3)* %in
  %lo = zext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
134
; High element comes from the i16 argument; low element is a zero-extended i8
; loaded from local memory.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: ds_read_u8 v
; GFX900: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %byte = load i8, i8 addrspace(3)* %in
  %lo = zext i8 %byte to i16
  %hi.vec = insertelement <2 x i16> undef, i16 %reg, i32 1
  %vec = insertelement <2 x i16> %hi.vec, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
152
; An i8 loaded from local memory is sign-extended to i16 and written into the
; low element of the i32 argument viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %byte = load i8, i8 addrspace(3)* %in
  %lo = sext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
171
; High element comes from the i16 argument; low element is a sign-extended i8
; loaded from local memory.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: ds_read_i8 v
; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}

; NO-D16-HI: ds_read_i8 v
define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %byte = load i8, i8 addrspace(3)* %in
  %lo = sext i8 %byte to i16
  %hi.vec = insertelement <2 x i16> undef, i16 %reg, i32 1
  %vec = insertelement <2 x i16> %hi.vec, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
188
; High half comes from the half argument; low half is a zero-extended i8 from
; local memory, bitcast from i16 to half.
; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: ds_read_u8 v
; GFX900: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %byte = load i8, i8 addrspace(3)* %in
  %wide = zext i8 %byte to i16
  %lo = bitcast i16 %wide to half
  %hi.vec = insertelement <2 x half> undef, half %reg, i32 1
  %vec = insertelement <2 x half> %hi.vec, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
207
; High half comes from the half argument; low half is a sign-extended i8 from
; local memory, bitcast from i16 to half.
; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: ds_read_i8 v
; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}

; NO-D16-HI: ds_read_i8 v
define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %byte = load i8, i8 addrspace(3)* %in
  %wide = sext i8 %byte to i16
  %lo = bitcast i16 %wide to half
  %hi.vec = insertelement <2 x half> undef, half %reg, i32 1
  %vec = insertelement <2 x half> %hi.vec, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
225
; The loaded low element has a second use: it is stored to local address null
; as well as being inserted into element 0 of %reg. The expected GFX900 code
; uses a plain ds_read_u16 followed by v_bfi_b32 rather than a d16 load.
; The high element of %reg is not used separately here, so no extractelement
; is needed.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lo:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900: ds_read_u16 v0, v0
; GFX900: v_mov_b32_e32 v3, 0
; GFX900: v_mov_b32_e32 v2, 0xffff
; GFX900: s_waitcnt lgkmcnt(0)
; GFX900: ds_write_b16 v3, v0
; GFX900: v_bfi_b32 v0, v2, v0, v1
; GFX900: global_store_dword v[0:1], v0, off
; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900: s_setpc_b64 s[30:31]

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  store i16 %load, i16 addrspace(3)* null
  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
248
; The high element of %reg has a second use: it is extracted and stored to
; local address null, while the loaded value replaces element 0 of %reg.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_hi:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900: v_lshrrev_b32_e32 v2, 16, v1
; GFX900: ds_read_u16_d16 v1, v0
; GFX900: v_mov_b32_e32 v0, 0
; GFX900: ds_write_b16 v0, v2
; GFX900: s_waitcnt lgkmcnt(1)
; GFX900: global_store_dword v[0:1], v1, off
; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900: s_setpc_b64 s[30:31]

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
entry:
  %lo = load i16, i16 addrspace(3)* %in
  %hi.elt = extractelement <2 x i16> %reg, i32 1
  store i16 %hi.elt, i16 addrspace(3)* null
  %vec = insertelement <2 x i16> %reg, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
270
; Both halves have extra uses: the loaded low value is stored to %out0 and the
; extracted high element of %reg is stored to %out1, before the loaded value
; replaces element 0 of %reg. All three local pointers are noalias.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900: ds_read_u16 v0, v0
; GFX900: v_lshrrev_b32_e32 v[[A_F16:[0-9]+]], 16, v1
; GFX900: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0xffff
; GFX900: s_waitcnt lgkmcnt(0)
; GFX900: ds_write_b16 v2, v0
; GFX900: ds_write_b16 v3, v[[A_F16]]
; GFX900: v_bfi_b32 v0, v[[A_F32]], v0, v1
; GFX900: global_store_dword v[0:1], v0, off
; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900: s_setpc_b64 s[30:31]

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
entry:
  %lo = load i16, i16 addrspace(3)* %in
  %hi.elt = extractelement <2 x i16> %reg, i32 1
  store i16 %lo, i16 addrspace(3)* %out0
  store i16 %hi.elt, i16 addrspace(3)* %out1
  %vec = insertelement <2 x i16> %reg, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
295
; Global (addrspace 1) i16 load at element index -2047 (offset:-4094 in the
; expected output) replaces the low element of %reg viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ushort v0, v[0:1], off offset:-4094
; GFX906: v_bfi_b32
define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %addr = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
  %lo = load i16, i16 addrspace(1)* %addr
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
315
; Global (addrspace 1) half load at element index -2047 replaces the low half
; of %reg viewed as <2 x half>.
; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ushort v0, v[0:1], off offset:-4094
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32_e32
; GFX906: v_lshl_or_b32

; GFX803: flat_load_ushort
define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %addr = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
  %lo = load half, half addrspace(1)* %addr
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
339
; Global i8 load at byte offset -4095, zero-extended to i16, replaces the low
; element of %reg viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906: v_bfi_b32

; GFX803: flat_load_ubyte
define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %byte = load i8, i8 addrspace(1)* %addr
  %lo = zext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
362
; Global i8 load at byte offset -4095, sign-extended to i16, replaces the low
; element of %reg viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906: v_bfi_b32

; GFX803: flat_load_sbyte
define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %byte = load i8, i8 addrspace(1)* %addr
  %lo = sext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
385
; Global i8 load at byte offset -4095, zero-extended to i16 and bitcast to
; half, replaces the low half of %reg viewed as <2 x half>.
; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906: v_and_b32_e32
; GFX906: v_lshl_or_b32

; GFX803: flat_load_ubyte
define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %byte = load i8, i8 addrspace(1)* %addr
  %wide = zext i8 %byte to i16
  %lo = bitcast i16 %wide to half
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
410
; Global i8 load at byte offset -4095, sign-extended to i16 and bitcast to
; half, replaces the low half of %reg viewed as <2 x half>.
; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32
; GFX906: v_lshl_or_b32

; GFX803: flat_load_sbyte
define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %byte = load i8, i8 addrspace(1)* %addr
  %wide = sext i8 %byte to i16
  %lo = bitcast i16 %wide to half
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
436
; Flat (default address space) i16 load replaces the low element of %reg
; viewed as <2 x i16>; the result is stored to global memory.
; GCN-LABEL: {{^}}load_flat_lo_v2i16_reghi_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort v{{[0-9]+}}
; GFX803: v_or_b32_e32

; GFX906: flat_load_ushort [[LOAD:v[0-9]+]]
; GFX906: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
; GFX906: v_bfi_b32 v{{[0-9]+}}, [[MASK]], [[LOAD]], v2
; GFX906: global_store_dword
define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %lo = load i16, i16* %in
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
460
; Flat (default address space) half load replaces the low half of %reg viewed
; as <2 x half>; the result is stored to global memory.
; GCN-LABEL: {{^}}load_flat_lo_v2f16_reghi_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort v{{[0-9]+}}
; GFX803: v_or_b32_e32

; FIXME: and should be removable
; GFX906: flat_load_ushort [[LOAD:v[0-9]+]]
; GFX906: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, v2
; GFX906: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
; GFX906: v_lshl_or_b32 [[LSHL_OR:v[0-9]+]], [[SHR]], 16, [[AND]]
; GFX906: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[LSHL_OR]]
define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %lo = load half, half* %in
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
486
; Flat i8 load, zero-extended to i16, replaces the low element of %reg viewed
; as <2 x i16>.
; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ubyte [[LO:v[0-9]+]]
; GFX803: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2
; GFX803: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00
; GFX803: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]]
; GFX803: flat_store_dword v[0:1], [[RES]]

; GFX906: flat_load_ubyte
; GFX906: v_bfi_b32
define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %byte = load i8, i8* %in
  %lo = zext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
512
; Flat i8 load, sign-extended to i16, replaces the low element of %reg viewed
; as <2 x i16>.
; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX906: flat_load_sbyte
; GFX906: v_bfi_b32
define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %byte = load i8, i8* %in
  %lo = sext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
535
; Flat i8 load, zero-extended to i16 and bitcast to half, replaces the low
; half of %reg viewed as <2 x half>.
; GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ubyte [[LO:v[0-9]+]]
; GFX803: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2
; GFX803: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00
; GFX803: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]]
; GFX803: flat_store_dword v[0:1], [[RES]]

; GFX906: flat_load_ubyte
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32
; GFX906: v_lshl_or_b32
define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %byte = load i8, i8* %in
  %wide = zext i8 %byte to i16
  %lo = bitcast i16 %wide to half
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
564
; Flat i8 load, sign-extended to i16 and bitcast to half, replaces the low
; half of %reg viewed as <2 x half>.
; GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX906: flat_load_sbyte
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32
; GFX906: v_lshl_or_b32
define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %byte = load i8, i8* %in
  %wide = sext i8 %byte to i16
  %lo = bitcast i16 %wide to half
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
590
; Private (addrspace 5) i16 load through a byval pointer at element index 2047
; (offset:4094 in the expected output) replaces the low element of %reg viewed
; as <2 x i16>.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %addr = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %lo = load i16, i16 addrspace(5)* %addr
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
609
; High element comes from the scalar i16 argument; low element is a private
; (addrspace 5) i16 load through a byval pointer at element index 2047.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_ushort v1, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900: v_and_b32
; GFX900: v_lshl_or_b32

; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %addr = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %lo = load i16, i16 addrspace(5)* %addr
  %hi.vec = insertelement <2 x i16> undef, i16 %reg, i32 1
  %vec = insertelement <2 x i16> %hi.vec, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
631
; Private (addrspace 5) half load through a byval pointer at element index
; 2047 replaces the low half of %reg viewed as <2 x half>.
; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %addr = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
  %lo = load half, half addrspace(5)* %addr
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
650
; Volatile private i16 load from the constant address 4094 (the %in argument
; is not used) replaces the low element of %reg viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %lo = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
668
; Volatile private i16 load from the constant address 4094 (the %in argument
; is not used) replaces the low element of %reg viewed as <2 x i16>.
; NOTE(review): the IR body and expected output are identical to
; @load_private_lo_v2i16_reglo_vreg_nooff - confirm whether a different
; high-half source was intended for this variant.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %lo = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
686
; Volatile private half load from the constant address 4094 (the %in argument
; is not used) replaces the low half of %reg viewed as <2 x half>.
; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %lo = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
704
; Private i8 load through a byval pointer at byte offset 4095, zero-extended
; to i16, replaces the low element of %reg viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %addr = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %byte = load i8, i8 addrspace(5)* %addr
  %lo = zext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
724
; Private i8 load through a byval pointer at byte offset 4095, sign-extended
; to i16, replaces the low element of %reg viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %addr = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %byte = load i8, i8 addrspace(5)* %addr
  %lo = sext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
744
; Volatile private i8 load from the constant address 4094 (the %in argument is
; not used), zero-extended to i16, replaces the low element of %reg viewed as
; <2 x i16>.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %byte = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %lo = zext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
763
; Volatile private i8 load from the constant address 4094 (the %in argument is
; not used), sign-extended to i16, replaces the low element of %reg viewed as
; <2 x i16>.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %byte = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %lo = sext i8 %byte to i16
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
782
; Volatile private i8 load from the constant address 4094 (the %in argument is
; not used), zero-extended to i16 and bitcast to half, replaces the low half
; of %reg viewed as <2 x half>.
; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x half>
  %byte = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %wide = zext i8 %byte to i16
  %lo = bitcast i16 %wide to half
  %vec = insertelement <2 x half> %base, half %lo, i32 0
  store <2 x half> %vec, <2 x half> addrspace(1)* undef
  ret void
}
802
; Constant (addrspace 4) i16 load at element index -2047 replaces the low
; element of %reg viewed as <2 x i16>.
; GCN-LABEL: {{^}}load_constant_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort

; GFX906: global_load_ushort
define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
entry:
  %base = bitcast i32 %reg to <2 x i16>
  %addr = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %lo = load i16, i16 addrspace(4)* %addr
  %vec = insertelement <2 x i16> %base, i16 %lo, i32 0
  store <2 x i16> %vec, <2 x i16> addrspace(1)* undef
  ret void
}
823
; Constant (addrspace 4) half load at element index -2047 replaces the low
; half of %reg viewed as <2 x half>.
; The label pattern is anchored at line start and ends with the colon, like
; the other labels in this file, so it cannot match a longer symbol name that
; merely begins with this one.
; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort

; GFX906: global_load_ushort
define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
844
; Loads an i8 from addrspace(4) at %in - 4095 bytes, zero-extends it to 16
; bits, reinterprets those bits as a half and inserts it into element 0 of the
; <2 x half> view of %reg. The gfx900 checks expect a single zero-extending
; d16 byte load carrying the -4095 immediate offset.
845; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg_zexti8:
846; GCN: s_waitcnt
847; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
848; GFX900-NEXT: s_waitcnt
849; GFX900-NEXT: global_store_dword
850; GFX900-NEXT: s_waitcnt
851; GFX900-NEXT: s_setpc_b64
852
; The gfx906 run expects a non-d16 byte load followed by separate and /
; shift-or instructions.
853; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
854; GFX906: v_and_b32_e32
855; GFX906: v_lshl_or_b32
856
; The fiji run only requires a flat byte load.
857; GFX803: flat_load_ubyte
858define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
859entry:
860  %reg.bc = bitcast i32 %reg to <2 x half>
  ; i8 elements, so the index -4095 is also the byte offset.
861  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
862  %load = load i8, i8 addrspace(4)* %gep
863  %ext = zext i8 %load to i16
  ; Reinterpret the widened integer bits as a half; no numeric conversion.
864  %bitcast = bitcast i16 %ext to half
  ; Only element 0 is replaced; element 1 of %reg.bc is carried through.
865  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  ; Destination pointer is undef (presumably only there to keep the value used).
866  store <2 x half> %build1, <2 x half> addrspace(1)* undef
867  ret void
868}
869
; Loads an i8 from addrspace(4) at %in - 4095 bytes, sign-extends it to 16
; bits, reinterprets those bits as a half and inserts it into element 0 of the
; <2 x half> view of %reg. The gfx900 checks expect a single sign-extending
; d16 byte load carrying the -4095 immediate offset.
870; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg_sexti8:
871; GCN: s_waitcnt
872; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
873; GFX900-NEXT: s_waitcnt
874; GFX900-NEXT: global_store_dword
875; GFX900-NEXT: s_waitcnt
876; GFX900-NEXT: s_setpc_b64
877
; The gfx906 run expects a non-d16 signed byte load followed by separate
; shift / and / shift-or instructions.
878; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
879; GFX906: v_lshrrev_b32
880; GFX906: v_and_b32
881; GFX906: v_lshl_or_b32
882
; The fiji run only requires a flat signed byte load.
883; GFX803: flat_load_sbyte
884define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
885entry:
886  %reg.bc = bitcast i32 %reg to <2 x half>
  ; i8 elements, so the index -4095 is also the byte offset.
887  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
888  %load = load i8, i8 addrspace(4)* %gep
889  %ext = sext i8 %load to i16
  ; Reinterpret the widened integer bits as a half; no numeric conversion.
890  %bitcast = bitcast i16 %ext to half
  ; Only element 0 is replaced; element 1 of %reg.bc is carried through.
891  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  ; Destination pointer is undef (presumably only there to keep the value used).
892  store <2 x half> %build1, <2 x half> addrspace(1)* undef
893  ret void
894}
895
; Two addrspace(5) allocas are created: a 40-byte [10 x i32] that receives a
; volatile store, and a [4096 x i16] whose element 2027 (byte 4054 of that
; array) is loaded volatile and inserted into element 0 of the <2 x i16> view
; of %reg. The gfx900 checks expect the d16 load on the line directly after
; the buffer store, addressed as s32 plus an immediate offset of 4094.
; NOTE(review): 4094 = 40 + 4054, which matches %obj1 being placed right after
; %obj0 - the frame layout is chosen by llc, not by this IR; confirm if the
; expected offset ever changes.
896; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset:
897; GFX900: buffer_store_dword
898; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
899
; The no-d16 runs (gfx906 and fiji) only require a non-d16 16-bit buffer load.
900; NO-D16-HI: buffer_load_ushort v
901define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
902entry:
903  %obj0 = alloca [10 x i32], align 4, addrspace(5)
904  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
905  %reg.bc = bitcast i32 %reg to <2 x i16>
906  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile store into the first alloca so that object is actually accessed.
907  store volatile i32 123, i32 addrspace(5)* %bc
  ; Element 2027 * 2 bytes = byte 4054 within %obj1.
908  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
909  %load = load volatile i16, i16 addrspace(5)* %gep
  ; Only element 0 is replaced; element 1 of %reg.bc is carried through.
910  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  ; Destination pointer is undef (presumably only there to keep the value used).
911  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
912  ret void
913}
914
; Two addrspace(5) allocas are created: a 40-byte [10 x i32] that receives a
; volatile store, and a [4096 x i8] whose byte 4055 is loaded volatile,
; sign-extended to i16 and inserted into element 0 of the <2 x i16> view of
; %reg. The gfx900 checks expect the sign-extending d16 byte load on the line
; directly after the buffer store, addressed as s32 plus an immediate offset
; of 4095.
; NOTE(review): 4095 = 40 + 4055, which matches %obj1 being placed right after
; %obj0 - the frame layout is chosen by llc, not by this IR; confirm if the
; expected offset ever changes.
915; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
916; GFX900: buffer_store_dword
917; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
918
; The no-d16 runs (gfx906 and fiji) only require a non-d16 signed byte load.
919; NO-D16-HI: buffer_load_sbyte v
920define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
921entry:
922  %obj0 = alloca [10 x i32], align 4, addrspace(5)
923  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
924  %reg.bc = bitcast i32 %reg to <2 x i16>
925  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile store into the first alloca so that object is actually accessed.
926  store volatile i32 123, i32 addrspace(5)* %bc
  ; i8 elements, so index 4055 is also the byte offset within %obj1.
927  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
928  %load = load volatile i8, i8 addrspace(5)* %gep
929  %load.ext = sext i8 %load to i16
  ; Only element 0 is replaced; element 1 of %reg.bc is carried through.
930  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
  ; Destination pointer is undef (presumably only there to keep the value used).
931  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
932  ret void
933}
934
; Two addrspace(5) allocas are created: a 40-byte [10 x i32] that receives a
; volatile store, and a [4096 x i8] whose byte 4055 is loaded volatile,
; zero-extended to i16 and inserted into element 0 of the <2 x i16> view of
; %reg. The gfx900 checks expect the zero-extending d16 byte load on the line
; directly after the buffer store, addressed as s32 plus an immediate offset
; of 4095.
; NOTE(review): 4095 = 40 + 4055, which matches %obj1 being placed right after
; %obj0 - the frame layout is chosen by llc, not by this IR; confirm if the
; expected offset ever changes.
935; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
936; GFX900: buffer_store_dword
937; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
938
; The no-d16 runs (gfx906 and fiji) only require a non-d16 unsigned byte load.
939; NO-D16-HI: buffer_load_ubyte v
940define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
941entry:
942  %obj0 = alloca [10 x i32], align 4, addrspace(5)
943  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
944  %reg.bc = bitcast i32 %reg to <2 x i16>
945  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile store into the first alloca so that object is actually accessed.
946  store volatile i32 123, i32 addrspace(5)* %bc
  ; i8 elements, so index 4055 is also the byte offset within %obj1.
947  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
948  %load = load volatile i8, i8 addrspace(5)* %gep
949  %load.ext = zext i8 %load to i16
  ; Only element 0 is replaced; element 1 of %reg.bc is carried through.
950  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
  ; Destination pointer is undef (presumably only there to keep the value used).
951  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
952  ret void
953}
954
; Half-vector variant of the sexti8 stack-offset test: a 40-byte [10 x i32]
; alloca receives a volatile store, then byte 4055 of a [4096 x i8] alloca is
; loaded volatile, sign-extended to i16, reinterpreted as a half and inserted
; into element 0 of the <2 x half> view of %reg. The gfx900 checks expect the
; sign-extending d16 byte load on the line directly after the buffer store,
; addressed as s32 plus an immediate offset of 4095.
; NOTE(review): 4095 = 40 + 4055, which matches %obj1 being placed right after
; %obj0 - the frame layout is chosen by llc, not by this IR; confirm if the
; expected offset ever changes.
955; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
956; GFX900: buffer_store_dword
957; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
958
; The no-d16 runs (gfx906 and fiji) only require a non-d16 signed byte load.
959; NO-D16-HI: buffer_load_sbyte v
960define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
961entry:
962  %obj0 = alloca [10 x i32], align 4, addrspace(5)
963  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
964  %reg.bc = bitcast i32 %reg to <2 x half>
965  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile store into the first alloca so that object is actually accessed.
966  store volatile i32 123, i32 addrspace(5)* %bc
  ; i8 elements, so index 4055 is also the byte offset within %obj1.
967  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
968  %load = load volatile i8, i8 addrspace(5)* %gep
969  %load.ext = sext i8 %load to i16
  ; Reinterpret the widened integer bits as a half; no numeric conversion.
970  %bitcast = bitcast i16 %load.ext to half
  ; Only element 0 is replaced; element 1 of %reg.bc is carried through.
971  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  ; Destination pointer is undef (presumably only there to keep the value used).
972  store <2 x half> %build1, <2 x half> addrspace(1)* undef
973  ret void
974}
975
; Half-vector variant of the zexti8 stack-offset test: a 40-byte [10 x i32]
; alloca receives a volatile store, then byte 4055 of a [4096 x i8] alloca is
; loaded volatile, zero-extended to i16, reinterpreted as a half and inserted
; into element 0 of the <2 x half> view of %reg. The gfx900 checks expect the
; zero-extending d16 byte load on the line directly after the buffer store,
; addressed as s32 plus an immediate offset of 4095.
; NOTE(review): 4095 = 40 + 4055, which matches %obj1 being placed right after
; %obj0 - the frame layout is chosen by llc, not by this IR; confirm if the
; expected offset ever changes.
976; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
977; GFX900: buffer_store_dword
978; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
979
; The no-d16 runs (gfx906 and fiji) only require a non-d16 unsigned byte load.
980; NO-D16-HI: buffer_load_ubyte v
981define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
982entry:
983  %obj0 = alloca [10 x i32], align 4, addrspace(5)
984  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
985  %reg.bc = bitcast i32 %reg to <2 x half>
986  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile store into the first alloca so that object is actually accessed.
987  store volatile i32 123, i32 addrspace(5)* %bc
  ; i8 elements, so index 4055 is also the byte offset within %obj1.
988  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
989  %load = load volatile i8, i8 addrspace(5)* %gep
990  %load.ext = zext i8 %load to i16
  ; Reinterpret the widened integer bits as a half; no numeric conversion.
991  %bitcast = bitcast i16 %load.ext to half
  ; Only element 0 is replaced; element 1 of %reg.bc is carried through.
992  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  ; Destination pointer is undef (presumably only there to keep the value used).
993  store <2 x half> %build1, <2 x half> addrspace(1)* undef
994  ret void
995}
996
997attributes #0 = { nounwind }
998