1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GFX900-MUBUF %s
3; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
4; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX803 %s
5; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs --amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s
6
; Loads an i16 through an addrspace(3) pointer and returns a <2 x i16> whose
; element 0 is the loaded value and whose element 1 is undef.
; Per the checks below, the gfx900 runs expect a single ds_read_u16_d16,
; while the gfx906 (+sram-ecc) and fiji runs expect a plain ds_read_u16; only
; the fiji run writes m0 before the DS access.
7define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
8; GFX900-LABEL: load_local_lo_v2i16_undeflo:
9; GFX900:       ; %bb.0: ; %entry
10; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX900-NEXT:    ds_read_u16_d16 v0, v0
12; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX900-NEXT:    s_setpc_b64 s[30:31]
14;
15; GFX906-LABEL: load_local_lo_v2i16_undeflo:
16; GFX906:       ; %bb.0: ; %entry
17; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18; GFX906-NEXT:    ds_read_u16 v0, v0
19; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
20; GFX906-NEXT:    s_setpc_b64 s[30:31]
21;
22; GFX803-LABEL: load_local_lo_v2i16_undeflo:
23; GFX803:       ; %bb.0: ; %entry
24; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25; GFX803-NEXT:    s_mov_b32 m0, -1
26; GFX803-NEXT:    ds_read_u16 v0, v0
27; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
28; GFX803-NEXT:    s_setpc_b64 s[30:31]
29entry:
30  %load = load i16, i16 addrspace(3)* %in
31  %build = insertelement <2 x i16> undef, i16 %load, i32 0
32  ret <2 x i16> %build
33}
34
; Builds a <2 x i16> from scratch: element 1 is the i16 argument %reg and
; element 0 is an i16 loaded through the addrspace(3) pointer; the vector is
; returned.
; Per the checks below, no run is expected to use a d16 load here: the gfx900
; and gfx906 runs mask the loaded value and merge with v_lshl_or_b32, and the
; fiji run shifts the register operand and ORs it in.
35define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
36; GFX900-LABEL: load_local_lo_v2i16_reglo:
37; GFX900:       ; %bb.0: ; %entry
38; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX900-NEXT:    ds_read_u16 v0, v0
40; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
42; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
43; GFX900-NEXT:    s_setpc_b64 s[30:31]
44;
45; GFX906-LABEL: load_local_lo_v2i16_reglo:
46; GFX906:       ; %bb.0: ; %entry
47; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48; GFX906-NEXT:    ds_read_u16 v0, v0
49; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
51; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
52; GFX906-NEXT:    s_setpc_b64 s[30:31]
53;
54; GFX803-LABEL: load_local_lo_v2i16_reglo:
55; GFX803:       ; %bb.0: ; %entry
56; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57; GFX803-NEXT:    s_mov_b32 m0, -1
58; GFX803-NEXT:    ds_read_u16 v0, v0
59; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
60; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
62; GFX803-NEXT:    s_setpc_b64 s[30:31]
63entry:
64  %load = load i16, i16 addrspace(3)* %in
65  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
66  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
67  ret <2 x i16> %build1
68}
69
70; Show that we get reasonable regalloc without physreg constraints.
; Same vector construction as a from-scratch build (element 1 = %reg,
; element 0 = i16 loaded through the addrspace(3) pointer), but the result is
; stored through an undef addrspace(1) pointer instead of being returned.
; Per the checks below, every run expects a plain ds_read_u16 followed by
; explicit packing, then a dword store (global_store_dword on the gfx900 and
; gfx906 runs, flat_store_dword on the fiji run).
71define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
72; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg:
73; GFX900:       ; %bb.0: ; %entry
74; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; GFX900-NEXT:    ds_read_u16 v0, v0
76; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
78; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
79; GFX900-NEXT:    global_store_dword v[0:1], v0, off
80; GFX900-NEXT:    s_waitcnt vmcnt(0)
81; GFX900-NEXT:    s_setpc_b64 s[30:31]
82;
83; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg:
84; GFX906:       ; %bb.0: ; %entry
85; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86; GFX906-NEXT:    ds_read_u16 v0, v0
87; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
88; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
89; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
90; GFX906-NEXT:    global_store_dword v[0:1], v0, off
91; GFX906-NEXT:    s_waitcnt vmcnt(0)
92; GFX906-NEXT:    s_setpc_b64 s[30:31]
93;
94; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg:
95; GFX803:       ; %bb.0: ; %entry
96; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; GFX803-NEXT:    s_mov_b32 m0, -1
98; GFX803-NEXT:    ds_read_u16 v0, v0
99; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
100; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
102; GFX803-NEXT:    flat_store_dword v[0:1], v0
103; GFX803-NEXT:    s_waitcnt vmcnt(0)
104; GFX803-NEXT:    s_setpc_b64 s[30:31]
105entry:
106  %load = load i16, i16 addrspace(3)* %in
107  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
108  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
109  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
110  ret void
111}
112
; Loads an i16 through an addrspace(3) pointer and inserts it into element 0
; of a zeroinitializer <2 x i16>, so element 1 of the returned vector is 0.
; Per the checks below, the gfx900 runs materialize 0 in v1, load into it with
; ds_read_u16_d16 and then copy v1 to v0; the gfx906 run expects ds_read_u16
; plus a 0xffff mask; the fiji run expects only the ds_read_u16.
113define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
114; GFX900-LABEL: load_local_lo_v2i16_zerolo:
115; GFX900:       ; %bb.0: ; %entry
116; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117; GFX900-NEXT:    v_mov_b32_e32 v1, 0
118; GFX900-NEXT:    ds_read_u16_d16 v1, v0
119; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX900-NEXT:    v_mov_b32_e32 v0, v1
121; GFX900-NEXT:    s_setpc_b64 s[30:31]
122;
123; GFX906-LABEL: load_local_lo_v2i16_zerolo:
124; GFX906:       ; %bb.0: ; %entry
125; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; GFX906-NEXT:    ds_read_u16 v0, v0
127; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
129; GFX906-NEXT:    s_setpc_b64 s[30:31]
130;
131; GFX803-LABEL: load_local_lo_v2i16_zerolo:
132; GFX803:       ; %bb.0: ; %entry
133; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134; GFX803-NEXT:    s_mov_b32 m0, -1
135; GFX803-NEXT:    ds_read_u16 v0, v0
136; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
137; GFX803-NEXT:    s_setpc_b64 s[30:31]
138entry:
139  %load = load i16, i16 addrspace(3)* %in
140  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
141  ret <2 x i16> %build
142}
143
; Loads a half through an addrspace(3) pointer and inserts it into element 0
; of the constant vector <half 0.0, half 2.0>; element 1 of the returned
; vector therefore stays half 2.0.
; Per the checks below, the gfx900 runs preload v1 with the 2.0 operand and
; then use ds_read_u16_d16; the gfx906 run materializes 0x4000 in s4 and
; merges with v_lshl_or_b32; the fiji run ORs the 2.0 operand into the
; loaded value.
144define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
145; GFX900-LABEL: load_local_lo_v2f16_fpimm:
146; GFX900:       ; %bb.0: ; %entry
147; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
148; GFX900-NEXT:    v_mov_b32_e32 v1, 2.0
149; GFX900-NEXT:    ds_read_u16_d16 v1, v0
150; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX900-NEXT:    v_mov_b32_e32 v0, v1
152; GFX900-NEXT:    s_setpc_b64 s[30:31]
153;
154; GFX906-LABEL: load_local_lo_v2f16_fpimm:
155; GFX906:       ; %bb.0: ; %entry
156; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157; GFX906-NEXT:    ds_read_u16 v0, v0
158; GFX906-NEXT:    s_movk_i32 s4, 0x4000
159; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
161; GFX906-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
162; GFX906-NEXT:    s_setpc_b64 s[30:31]
163;
164; GFX803-LABEL: load_local_lo_v2f16_fpimm:
165; GFX803:       ; %bb.0: ; %entry
166; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167; GFX803-NEXT:    s_mov_b32 m0, -1
168; GFX803-NEXT:    ds_read_u16 v0, v0
169; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX803-NEXT:    v_or_b32_e32 v0, 2.0, v0
171; GFX803-NEXT:    s_setpc_b64 s[30:31]
172entry:
173  %load = load half, half addrspace(3)* %in
174  %build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
175  ret <2 x half> %build
176}
177
; Reinterprets the i32 argument %reg as <2 x half>, replaces element 0 with a
; half loaded through the addrspace(3) pointer (element 1 keeps its value
; from %reg), and stores the result through an undef addrspace(1) pointer.
; Per the checks below, the gfx900 runs expect one ds_read_u16_d16 directly
; into v1 followed by the store of v1; the gfx906 and fiji runs expect a plain
; ds_read_u16 plus explicit extraction of the upper 16 bits and repacking.
178define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
179; GFX900-LABEL: load_local_lo_v2f16_reghi_vreg:
180; GFX900:       ; %bb.0: ; %entry
181; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182; GFX900-NEXT:    ds_read_u16_d16 v1, v0
183; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX900-NEXT:    global_store_dword v[0:1], v1, off
185; GFX900-NEXT:    s_waitcnt vmcnt(0)
186; GFX900-NEXT:    s_setpc_b64 s[30:31]
187;
188; GFX906-LABEL: load_local_lo_v2f16_reghi_vreg:
189; GFX906:       ; %bb.0: ; %entry
190; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191; GFX906-NEXT:    ds_read_u16 v0, v0
192; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
193; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
195; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
196; GFX906-NEXT:    global_store_dword v[0:1], v0, off
197; GFX906-NEXT:    s_waitcnt vmcnt(0)
198; GFX906-NEXT:    s_setpc_b64 s[30:31]
199;
200; GFX803-LABEL: load_local_lo_v2f16_reghi_vreg:
201; GFX803:       ; %bb.0: ; %entry
202; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203; GFX803-NEXT:    s_mov_b32 m0, -1
204; GFX803-NEXT:    ds_read_u16 v0, v0
205; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
206; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
207; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
208; GFX803-NEXT:    flat_store_dword v[0:1], v0
209; GFX803-NEXT:    s_waitcnt vmcnt(0)
210; GFX803-NEXT:    s_setpc_b64 s[30:31]
211entry:
212  %reg.bc = bitcast i32 %reg to <2 x half>
213  %load = load half, half addrspace(3)* %in
214  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
215  store <2 x half> %build1, <2 x half> addrspace(1)* undef
216  ret void
217}
218
; Builds a <2 x half> from scratch: element 1 is the half argument %reg and
; element 0 is a half loaded through the addrspace(3) pointer; the result is
; stored through an undef addrspace(1) pointer.
; Per the checks below, every run expects a plain ds_read_u16 with explicit
; packing (mask + v_lshl_or_b32 on the gfx900 and gfx906 runs, shift + OR on
; the fiji run); no d16 load is expected.
219define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
220; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg:
221; GFX900:       ; %bb.0: ; %entry
222; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
223; GFX900-NEXT:    ds_read_u16 v0, v0
224; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
226; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
227; GFX900-NEXT:    global_store_dword v[0:1], v0, off
228; GFX900-NEXT:    s_waitcnt vmcnt(0)
229; GFX900-NEXT:    s_setpc_b64 s[30:31]
230;
231; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg:
232; GFX906:       ; %bb.0: ; %entry
233; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234; GFX906-NEXT:    ds_read_u16 v0, v0
235; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
237; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
238; GFX906-NEXT:    global_store_dword v[0:1], v0, off
239; GFX906-NEXT:    s_waitcnt vmcnt(0)
240; GFX906-NEXT:    s_setpc_b64 s[30:31]
241;
242; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg:
243; GFX803:       ; %bb.0: ; %entry
244; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245; GFX803-NEXT:    s_mov_b32 m0, -1
246; GFX803-NEXT:    ds_read_u16 v0, v0
247; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
248; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
250; GFX803-NEXT:    flat_store_dword v[0:1], v0
251; GFX803-NEXT:    s_waitcnt vmcnt(0)
252; GFX803-NEXT:    s_setpc_b64 s[30:31]
253entry:
254  %load = load half, half addrspace(3)* %in
255  %build0 = insertelement <2 x half> undef, half %reg, i32 1
256  %build1 = insertelement <2 x half> %build0, half %load, i32 0
257  store <2 x half> %build1, <2 x half> addrspace(1)* undef
258  ret void
259}
260
; Reinterprets the i32 argument %reg as <2 x i16>, replaces element 0 with an
; i8 loaded through the addrspace(3) pointer and zero-extended to i16, and
; stores the result through an undef addrspace(1) pointer.
; Per the checks below, the gfx900 runs expect ds_read_u8_d16 directly into
; v1; the gfx906 run expects ds_read_u8 merged via v_bfi_b32 with a 0xffff
; mask; the fiji run expects ds_read_u8 merged via v_perm_b32.
261define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
262; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
263; GFX900:       ; %bb.0: ; %entry
264; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GFX900-NEXT:    ds_read_u8_d16 v1, v0
266; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX900-NEXT:    global_store_dword v[0:1], v1, off
268; GFX900-NEXT:    s_waitcnt vmcnt(0)
269; GFX900-NEXT:    s_setpc_b64 s[30:31]
270;
271; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
272; GFX906:       ; %bb.0: ; %entry
273; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274; GFX906-NEXT:    ds_read_u8 v0, v0
275; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
276; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
277; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
278; GFX906-NEXT:    global_store_dword v[0:1], v0, off
279; GFX906-NEXT:    s_waitcnt vmcnt(0)
280; GFX906-NEXT:    s_setpc_b64 s[30:31]
281;
282; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
283; GFX803:       ; %bb.0: ; %entry
284; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
285; GFX803-NEXT:    s_mov_b32 m0, -1
286; GFX803-NEXT:    ds_read_u8 v0, v0
287; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
288; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
289; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
290; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
291; GFX803-NEXT:    flat_store_dword v[0:1], v0
292; GFX803-NEXT:    s_waitcnt vmcnt(0)
293; GFX803-NEXT:    s_setpc_b64 s[30:31]
294entry:
295  %reg.bc = bitcast i32 %reg to <2 x i16>
296  %load = load i8, i8 addrspace(3)* %in
297  %ext = zext i8 %load to i16
298  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
299  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
300  ret void
301}
302
; Builds a <2 x i16> from scratch: element 1 is the i16 argument %reg and
; element 0 is an i8 loaded through the addrspace(3) pointer and zero-extended
; to i16; the result is stored through an undef addrspace(1) pointer.
; Per the checks below, every run expects a plain ds_read_u8 with explicit
; packing; no d16 load is expected.
303define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
304; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
305; GFX900:       ; %bb.0: ; %entry
306; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307; GFX900-NEXT:    ds_read_u8 v0, v0
308; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
309; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
310; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
311; GFX900-NEXT:    global_store_dword v[0:1], v0, off
312; GFX900-NEXT:    s_waitcnt vmcnt(0)
313; GFX900-NEXT:    s_setpc_b64 s[30:31]
314;
315; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
316; GFX906:       ; %bb.0: ; %entry
317; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318; GFX906-NEXT:    ds_read_u8 v0, v0
319; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
321; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
322; GFX906-NEXT:    global_store_dword v[0:1], v0, off
323; GFX906-NEXT:    s_waitcnt vmcnt(0)
324; GFX906-NEXT:    s_setpc_b64 s[30:31]
325;
326; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
327; GFX803:       ; %bb.0: ; %entry
328; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
329; GFX803-NEXT:    s_mov_b32 m0, -1
330; GFX803-NEXT:    ds_read_u8 v0, v0
331; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
332; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
334; GFX803-NEXT:    flat_store_dword v[0:1], v0
335; GFX803-NEXT:    s_waitcnt vmcnt(0)
336; GFX803-NEXT:    s_setpc_b64 s[30:31]
337entry:
338  %load = load i8, i8 addrspace(3)* %in
339  %ext = zext i8 %load to i16
340  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
341  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
342  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
343  ret void
344}
345
; Reinterprets the i32 argument %reg as <2 x i16>, replaces element 0 with an
; i8 loaded through the addrspace(3) pointer and sign-extended to i16, and
; stores the result through an undef addrspace(1) pointer.
; Per the checks below, the gfx900 runs expect ds_read_i8_d16 directly into
; v1; the gfx906 run expects ds_read_i8 merged via v_bfi_b32 with a 0xffff
; mask; the fiji run expects ds_read_i8 merged via an SDWA OR that selects
; WORD_0 of the loaded value.
346define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
347; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
348; GFX900:       ; %bb.0: ; %entry
349; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
350; GFX900-NEXT:    ds_read_i8_d16 v1, v0
351; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX900-NEXT:    global_store_dword v[0:1], v1, off
353; GFX900-NEXT:    s_waitcnt vmcnt(0)
354; GFX900-NEXT:    s_setpc_b64 s[30:31]
355;
356; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
357; GFX906:       ; %bb.0: ; %entry
358; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
359; GFX906-NEXT:    ds_read_i8 v0, v0
360; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
361; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
363; GFX906-NEXT:    global_store_dword v[0:1], v0, off
364; GFX906-NEXT:    s_waitcnt vmcnt(0)
365; GFX906-NEXT:    s_setpc_b64 s[30:31]
366;
367; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
368; GFX803:       ; %bb.0: ; %entry
369; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370; GFX803-NEXT:    s_mov_b32 m0, -1
371; GFX803-NEXT:    ds_read_i8 v0, v0
372; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
373; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
375; GFX803-NEXT:    flat_store_dword v[0:1], v0
376; GFX803-NEXT:    s_waitcnt vmcnt(0)
377; GFX803-NEXT:    s_setpc_b64 s[30:31]
378entry:
379  %reg.bc = bitcast i32 %reg to <2 x i16>
380  %load = load i8, i8 addrspace(3)* %in
381  %ext = sext i8 %load to i16
382  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
383  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
384  ret void
385}
386
; Builds a <2 x i16> from scratch: element 1 is the i16 argument %reg and
; element 0 is an i8 loaded through the addrspace(3) pointer and sign-extended
; to i16; the result is stored through an undef addrspace(1) pointer.
; Per the checks below, every run expects a plain ds_read_i8 with explicit
; packing; the fiji run does the merge with an SDWA OR selecting WORD_0 of
; the loaded value. No d16 load is expected.
387define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
388; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
389; GFX900:       ; %bb.0: ; %entry
390; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391; GFX900-NEXT:    ds_read_i8 v0, v0
392; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
394; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
395; GFX900-NEXT:    global_store_dword v[0:1], v0, off
396; GFX900-NEXT:    s_waitcnt vmcnt(0)
397; GFX900-NEXT:    s_setpc_b64 s[30:31]
398;
399; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
400; GFX906:       ; %bb.0: ; %entry
401; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402; GFX906-NEXT:    ds_read_i8 v0, v0
403; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
405; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
406; GFX906-NEXT:    global_store_dword v[0:1], v0, off
407; GFX906-NEXT:    s_waitcnt vmcnt(0)
408; GFX906-NEXT:    s_setpc_b64 s[30:31]
409;
410; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
411; GFX803:       ; %bb.0: ; %entry
412; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413; GFX803-NEXT:    s_mov_b32 m0, -1
414; GFX803-NEXT:    ds_read_i8 v0, v0
415; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
416; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
417; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
418; GFX803-NEXT:    flat_store_dword v[0:1], v0
419; GFX803-NEXT:    s_waitcnt vmcnt(0)
420; GFX803-NEXT:    s_setpc_b64 s[30:31]
421entry:
422  %load = load i8, i8 addrspace(3)* %in
423  %ext = sext i8 %load to i16
424  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
425  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
426  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
427  ret void
428}
429
; Builds a <2 x half> from scratch: element 1 is the half argument %reg and
; element 0 is an i8 loaded through the addrspace(3) pointer, zero-extended to
; i16 and bitcast to half; the result is stored through an undef addrspace(1)
; pointer.
; Per the checks below, every run expects a plain ds_read_u8 with explicit
; packing; no d16 load is expected.
430define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
431; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
432; GFX900:       ; %bb.0: ; %entry
433; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434; GFX900-NEXT:    ds_read_u8 v0, v0
435; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
436; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
437; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
438; GFX900-NEXT:    global_store_dword v[0:1], v0, off
439; GFX900-NEXT:    s_waitcnt vmcnt(0)
440; GFX900-NEXT:    s_setpc_b64 s[30:31]
441;
442; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
443; GFX906:       ; %bb.0: ; %entry
444; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445; GFX906-NEXT:    ds_read_u8 v0, v0
446; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
448; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
449; GFX906-NEXT:    global_store_dword v[0:1], v0, off
450; GFX906-NEXT:    s_waitcnt vmcnt(0)
451; GFX906-NEXT:    s_setpc_b64 s[30:31]
452;
453; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
454; GFX803:       ; %bb.0: ; %entry
455; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
456; GFX803-NEXT:    s_mov_b32 m0, -1
457; GFX803-NEXT:    ds_read_u8 v0, v0
458; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
459; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
461; GFX803-NEXT:    flat_store_dword v[0:1], v0
462; GFX803-NEXT:    s_waitcnt vmcnt(0)
463; GFX803-NEXT:    s_setpc_b64 s[30:31]
464entry:
465  %load = load i8, i8 addrspace(3)* %in
466  %ext = zext i8 %load to i16
467  %bitcast = bitcast i16 %ext to half
468  %build0 = insertelement <2 x half> undef, half %reg, i32 1
469  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
470  store <2 x half> %build1, <2 x half> addrspace(1)* undef
471  ret void
472}
473
; Builds a <2 x half> from scratch: element 1 is the half argument %reg and
; element 0 is an i8 loaded through the addrspace(3) pointer, sign-extended to
; i16 and bitcast to half; the result is stored through an undef addrspace(1)
; pointer.
; Per the checks below, every run expects a plain ds_read_i8 with explicit
; packing; the fiji run does the merge with an SDWA OR selecting WORD_0 of
; the loaded value. No d16 load is expected.
474define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
475; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
476; GFX900:       ; %bb.0: ; %entry
477; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478; GFX900-NEXT:    ds_read_i8 v0, v0
479; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
481; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
482; GFX900-NEXT:    global_store_dword v[0:1], v0, off
483; GFX900-NEXT:    s_waitcnt vmcnt(0)
484; GFX900-NEXT:    s_setpc_b64 s[30:31]
485;
486; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
487; GFX906:       ; %bb.0: ; %entry
488; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
489; GFX906-NEXT:    ds_read_i8 v0, v0
490; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
492; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
493; GFX906-NEXT:    global_store_dword v[0:1], v0, off
494; GFX906-NEXT:    s_waitcnt vmcnt(0)
495; GFX906-NEXT:    s_setpc_b64 s[30:31]
496;
497; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
498; GFX803:       ; %bb.0: ; %entry
499; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
500; GFX803-NEXT:    s_mov_b32 m0, -1
501; GFX803-NEXT:    ds_read_i8 v0, v0
502; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
503; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
504; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
505; GFX803-NEXT:    flat_store_dword v[0:1], v0
506; GFX803-NEXT:    s_waitcnt vmcnt(0)
507; GFX803-NEXT:    s_setpc_b64 s[30:31]
508entry:
509  %load = load i8, i8 addrspace(3)* %in
510  %ext = sext i8 %load to i16
511  %bitcast = bitcast i16 %ext to half
512  %build0 = insertelement <2 x half> undef, half %reg, i32 1
513  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
514  store <2 x half> %build1, <2 x half> addrspace(1)* undef
515  ret void
516}
517
; The loaded i16 has two uses: it is stored on its own through a null
; addrspace(3) pointer, and it replaces element 0 of the <2 x i16> argument
; %reg, which is then stored through an undef addrspace(1) pointer.
; Per the checks below, no run is expected to use a d16 load: each reads with
; a plain ds_read_u16, writes that value back with ds_write_b16, and merges it
; with %reg separately (v_bfi_b32 on the gfx900 and gfx906 runs, mask + SDWA
; OR on the fiji run).
; NOTE(review): %elt1 is defined in the body but not used by any later
; instruction in this function.
518define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
519; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
520; GFX900:       ; %bb.0: ; %entry
521; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522; GFX900-NEXT:    ds_read_u16 v0, v0
523; GFX900-NEXT:    v_mov_b32_e32 v2, 0
524; GFX900-NEXT:    v_mov_b32_e32 v3, 0xffff
525; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX900-NEXT:    ds_write_b16 v2, v0
527; GFX900-NEXT:    v_bfi_b32 v0, v3, v0, v1
528; GFX900-NEXT:    global_store_dword v[0:1], v0, off
529; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
530; GFX900-NEXT:    s_setpc_b64 s[30:31]
531;
532; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
533; GFX906:       ; %bb.0: ; %entry
534; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535; GFX906-NEXT:    ds_read_u16 v0, v0
536; GFX906-NEXT:    v_mov_b32_e32 v2, 0
537; GFX906-NEXT:    v_mov_b32_e32 v3, 0xffff
538; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX906-NEXT:    ds_write_b16 v2, v0
540; GFX906-NEXT:    v_bfi_b32 v0, v3, v0, v1
541; GFX906-NEXT:    global_store_dword v[0:1], v0, off
542; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
543; GFX906-NEXT:    s_setpc_b64 s[30:31]
544;
545; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
546; GFX803:       ; %bb.0: ; %entry
547; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548; GFX803-NEXT:    s_mov_b32 m0, -1
549; GFX803-NEXT:    ds_read_u16 v0, v0
550; GFX803-NEXT:    v_mov_b32_e32 v2, 0
551; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
552; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
553; GFX803-NEXT:    ds_write_b16 v2, v0
554; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
555; GFX803-NEXT:    flat_store_dword v[0:1], v0
556; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
557; GFX803-NEXT:    s_setpc_b64 s[30:31]
558entry:
559  %load = load i16, i16 addrspace(3)* %in
560  %elt1 = extractelement <2 x i16> %reg, i32 1
561  store i16 %load, i16 addrspace(3)* null
562  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
563  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
564  ret void
565}
566
; Element 1 of the <2 x i16> argument %reg has two uses: it is extracted and
; stored on its own through a null addrspace(3) pointer, and it remains in the
; vector whose element 0 is replaced by the loaded i16. The loaded value
; itself has a single use.
; Per the checks below, the gfx900 runs still expect ds_read_u16_d16 into v1
; (after first copying the upper 16 bits of v1 out with a right shift for the
; ds_write_b16); the gfx906 and fiji runs expect a plain ds_read_u16 plus
; explicit merging.
567define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
568; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
569; GFX900:       ; %bb.0: ; %entry
570; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
571; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
572; GFX900-NEXT:    ds_read_u16_d16 v1, v0
573; GFX900-NEXT:    v_mov_b32_e32 v0, 0
574; GFX900-NEXT:    ds_write_b16 v0, v2
575; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
576; GFX900-NEXT:    global_store_dword v[0:1], v1, off
577; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
578; GFX900-NEXT:    s_setpc_b64 s[30:31]
579;
580; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
581; GFX906:       ; %bb.0: ; %entry
582; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
583; GFX906-NEXT:    ds_read_u16 v0, v0
584; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
585; GFX906-NEXT:    v_mov_b32_e32 v3, 0
586; GFX906-NEXT:    ds_write_b16 v3, v2
587; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
588; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
589; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
590; GFX906-NEXT:    global_store_dword v[0:1], v0, off
591; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
592; GFX906-NEXT:    s_setpc_b64 s[30:31]
593;
594; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
595; GFX803:       ; %bb.0: ; %entry
596; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
597; GFX803-NEXT:    s_mov_b32 m0, -1
598; GFX803-NEXT:    ds_read_u16 v0, v0
599; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
600; GFX803-NEXT:    v_mov_b32_e32 v2, 0
601; GFX803-NEXT:    ds_write_b16 v2, v1
602; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
603; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
604; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
605; GFX803-NEXT:    flat_store_dword v[0:1], v0
606; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
607; GFX803-NEXT:    s_setpc_b64 s[30:31]
608entry:
609  %load = load i16, i16 addrspace(3)* %in
610  %elt1 = extractelement <2 x i16> %reg, i32 1
611  store i16 %elt1, i16 addrspace(3)* null
612  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
613  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
614  ret void
615}
616
; Both halves have extra uses: the loaded i16 is stored through %out0 and
; element 1 of the <2 x i16> argument %reg is stored through %out1, in
; addition to the vector (element 0 replaced by the loaded value) being stored
; through an undef addrspace(1) pointer. All three addrspace(3) pointer
; arguments are marked noalias.
; Per the checks below, no run is expected to use a d16 load: each reads with
; a plain ds_read_u16, issues two ds_write_b16, and merges the halves
; explicitly.
617define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
618; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
619; GFX900:       ; %bb.0: ; %entry
620; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
621; GFX900-NEXT:    ds_read_u16 v0, v0
622; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
623; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
624; GFX900-NEXT:    ds_write_b16 v2, v0
625; GFX900-NEXT:    ds_write_b16 v3, v4
626; GFX900-NEXT:    v_mov_b32_e32 v2, 0xffff
627; GFX900-NEXT:    v_bfi_b32 v0, v2, v0, v1
628; GFX900-NEXT:    global_store_dword v[0:1], v0, off
629; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
630; GFX900-NEXT:    s_setpc_b64 s[30:31]
631;
632; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
633; GFX906:       ; %bb.0: ; %entry
634; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
635; GFX906-NEXT:    ds_read_u16 v0, v0
636; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
637; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX906-NEXT:    ds_write_b16 v2, v0
639; GFX906-NEXT:    ds_write_b16 v3, v4
640; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
641; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
642; GFX906-NEXT:    global_store_dword v[0:1], v0, off
643; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
644; GFX906-NEXT:    s_setpc_b64 s[30:31]
645;
646; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
647; GFX803:       ; %bb.0: ; %entry
648; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
649; GFX803-NEXT:    s_mov_b32 m0, -1
650; GFX803-NEXT:    ds_read_u16 v0, v0
651; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
652; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX803-NEXT:    ds_write_b16 v2, v0
654; GFX803-NEXT:    ds_write_b16 v3, v1
655; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
656; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
657; GFX803-NEXT:    flat_store_dword v[0:1], v0
658; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
659; GFX803-NEXT:    s_setpc_b64 s[30:31]
660entry:
661  %load = load i16, i16 addrspace(3)* %in
662  %elt1 = extractelement <2 x i16> %reg, i32 1
663  store i16 %load, i16 addrspace(3)* %out0
664  store i16 %elt1, i16 addrspace(3)* %out1
665  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
666  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
667  ret void
668}
669
; addrspace(1) variant: reinterprets the i32 argument %reg as <2 x i16>,
; replaces element 0 with an i16 loaded from %in offset by -2047 i16 elements
; (inbounds GEP), and stores the result through an undef addrspace(1) pointer.
; Per the checks below, the gfx900 runs expect global_load_short_d16 with the
; offset folded into the instruction as offset:-4094; the gfx906 run folds the
; same offset into global_load_ushort and merges with v_bfi_b32; the fiji run
; adds 0xfffff002 to the 64-bit address with an add/addc pair before a
; flat_load_ushort.
670define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
671; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg:
672; GFX900:       ; %bb.0: ; %entry
673; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
674; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
675; GFX900-NEXT:    s_waitcnt vmcnt(0)
676; GFX900-NEXT:    global_store_dword v[0:1], v2, off
677; GFX900-NEXT:    s_waitcnt vmcnt(0)
678; GFX900-NEXT:    s_setpc_b64 s[30:31]
679;
680; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg:
681; GFX906:       ; %bb.0: ; %entry
682; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
684; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
685; GFX906-NEXT:    s_waitcnt vmcnt(0)
686; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
687; GFX906-NEXT:    global_store_dword v[0:1], v0, off
688; GFX906-NEXT:    s_waitcnt vmcnt(0)
689; GFX906-NEXT:    s_setpc_b64 s[30:31]
690;
691; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg:
692; GFX803:       ; %bb.0: ; %entry
693; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
694; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
695; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
696; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
697; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
698; GFX803-NEXT:    s_waitcnt vmcnt(0)
699; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
700; GFX803-NEXT:    flat_store_dword v[0:1], v0
701; GFX803-NEXT:    s_waitcnt vmcnt(0)
702; GFX803-NEXT:    s_setpc_b64 s[30:31]
703entry:
704  %reg.bc = bitcast i32 %reg to <2 x i16>
705  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
706  %load = load i16, i16 addrspace(1)* %gep
707  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
708  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
709  ret void
710}
711
; addrspace(1) half variant: reinterprets the i32 argument %reg as <2 x half>,
; replaces element 0 with a half loaded from %in offset by -2047 half elements
; (inbounds GEP), and stores the result through an undef addrspace(1) pointer.
; Per the checks below, the gfx900 runs expect global_load_short_d16 with
; offset:-4094; the gfx906 run expects global_load_ushort with the same offset
; followed by shift/mask/v_lshl_or_b32 repacking; the fiji run adds 0xfffff002
; to the 64-bit address with an add/addc pair before a flat_load_ushort.
712define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
713; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg:
714; GFX900:       ; %bb.0: ; %entry
715; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
717; GFX900-NEXT:    s_waitcnt vmcnt(0)
718; GFX900-NEXT:    global_store_dword v[0:1], v2, off
719; GFX900-NEXT:    s_waitcnt vmcnt(0)
720; GFX900-NEXT:    s_setpc_b64 s[30:31]
721;
722; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg:
723; GFX906:       ; %bb.0: ; %entry
724; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
725; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
726; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
727; GFX906-NEXT:    s_waitcnt vmcnt(0)
728; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
729; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
730; GFX906-NEXT:    global_store_dword v[0:1], v0, off
731; GFX906-NEXT:    s_waitcnt vmcnt(0)
732; GFX906-NEXT:    s_setpc_b64 s[30:31]
733;
734; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg:
735; GFX803:       ; %bb.0: ; %entry
736; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
737; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
738; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
739; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
740; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
741; GFX803-NEXT:    s_waitcnt vmcnt(0)
742; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
743; GFX803-NEXT:    flat_store_dword v[0:1], v0
744; GFX803-NEXT:    s_waitcnt vmcnt(0)
745; GFX803-NEXT:    s_setpc_b64 s[30:31]
746entry:
747  %reg.bc = bitcast i32 %reg to <2 x half>
748  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
749  %load = load half, half addrspace(1)* %gep
750  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
751  store <2 x half> %build1, <2 x half> addrspace(1)* undef
752  ret void
753}
754
; Loads an i8 from global memory at byte offset -4095, zero-extends it to i16
; and inserts it into element 0 of %reg viewed as <2 x i16>; element 1 of %reg
; is kept. The merged vector is stored through an undef global pointer.
; Expected output: gfx900 uses global_load_ubyte_d16 directly on v2; gfx906
; loads with global_load_ubyte and merges with v_bfi_b32; gfx803 computes the
; 64-bit address explicitly, uses flat_load_ubyte and merges with v_perm_b32.
755define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
756; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
757; GFX900:       ; %bb.0: ; %entry
758; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
759; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
760; GFX900-NEXT:    s_waitcnt vmcnt(0)
761; GFX900-NEXT:    global_store_dword v[0:1], v2, off
762; GFX900-NEXT:    s_waitcnt vmcnt(0)
763; GFX900-NEXT:    s_setpc_b64 s[30:31]
764;
765; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
766; GFX906:       ; %bb.0: ; %entry
767; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
768; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
769; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
770; GFX906-NEXT:    s_waitcnt vmcnt(0)
771; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
772; GFX906-NEXT:    global_store_dword v[0:1], v0, off
773; GFX906-NEXT:    s_waitcnt vmcnt(0)
774; GFX906-NEXT:    s_setpc_b64 s[30:31]
775;
776; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
777; GFX803:       ; %bb.0: ; %entry
778; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
779; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
780; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
781; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
782; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
783; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
784; GFX803-NEXT:    s_waitcnt vmcnt(0)
785; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
786; GFX803-NEXT:    flat_store_dword v[0:1], v0
787; GFX803-NEXT:    s_waitcnt vmcnt(0)
788; GFX803-NEXT:    s_setpc_b64 s[30:31]
789entry:
790  %reg.bc = bitcast i32 %reg to <2 x i16>
791  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
792  %load = load i8, i8 addrspace(1)* %gep
793  %ext = zext i8 %load to i16
794  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
795  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
796  ret void
797}
798
; Loads an i8 from global memory at byte offset -4095, sign-extends it to i16
; and inserts it into element 0 of %reg viewed as <2 x i16>; element 1 of %reg
; is kept. The merged vector is stored through an undef global pointer.
; Expected output: gfx900 uses global_load_sbyte_d16 directly on v2; gfx906
; loads with global_load_sbyte and merges with v_bfi_b32; gfx803 uses
; flat_load_sbyte and merges with v_and_b32 plus an SDWA v_or_b32 that selects
; only the low word of the loaded value.
799define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
800; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
801; GFX900:       ; %bb.0: ; %entry
802; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
803; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
804; GFX900-NEXT:    s_waitcnt vmcnt(0)
805; GFX900-NEXT:    global_store_dword v[0:1], v2, off
806; GFX900-NEXT:    s_waitcnt vmcnt(0)
807; GFX900-NEXT:    s_setpc_b64 s[30:31]
808;
809; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
810; GFX906:       ; %bb.0: ; %entry
811; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
812; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
813; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
814; GFX906-NEXT:    s_waitcnt vmcnt(0)
815; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
816; GFX906-NEXT:    global_store_dword v[0:1], v0, off
817; GFX906-NEXT:    s_waitcnt vmcnt(0)
818; GFX906-NEXT:    s_setpc_b64 s[30:31]
819;
820; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
821; GFX803:       ; %bb.0: ; %entry
822; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
823; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
824; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
825; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
826; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
827; GFX803-NEXT:    s_waitcnt vmcnt(0)
828; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
829; GFX803-NEXT:    flat_store_dword v[0:1], v0
830; GFX803-NEXT:    s_waitcnt vmcnt(0)
831; GFX803-NEXT:    s_setpc_b64 s[30:31]
832entry:
833  %reg.bc = bitcast i32 %reg to <2 x i16>
834  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
835  %load = load i8, i8 addrspace(1)* %gep
836  %ext = sext i8 %load to i16
837  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
838  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
839  ret void
840}
841
; Loads an i8 from global memory at byte offset -4095, zero-extends it to i16,
; bitcasts that to half and inserts it into element 0 of %reg viewed as
; <2 x half>; element 1 of %reg is kept. The result is stored through an undef
; global pointer.
; Expected output: gfx900 uses global_load_ubyte_d16 directly on v2; gfx906
; loads with global_load_ubyte and merges with v_and_b32 + v_lshl_or_b32;
; gfx803 uses flat_load_ubyte and merges with v_perm_b32.
842define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
843; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
844; GFX900:       ; %bb.0: ; %entry
845; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
846; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
847; GFX900-NEXT:    s_waitcnt vmcnt(0)
848; GFX900-NEXT:    global_store_dword v[0:1], v2, off
849; GFX900-NEXT:    s_waitcnt vmcnt(0)
850; GFX900-NEXT:    s_setpc_b64 s[30:31]
851;
852; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
853; GFX906:       ; %bb.0: ; %entry
854; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
855; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
856; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
857; GFX906-NEXT:    s_waitcnt vmcnt(0)
858; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
859; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
860; GFX906-NEXT:    global_store_dword v[0:1], v0, off
861; GFX906-NEXT:    s_waitcnt vmcnt(0)
862; GFX906-NEXT:    s_setpc_b64 s[30:31]
863;
864; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
865; GFX803:       ; %bb.0: ; %entry
866; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
867; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
868; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
869; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
870; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
871; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
872; GFX803-NEXT:    s_waitcnt vmcnt(0)
873; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
874; GFX803-NEXT:    flat_store_dword v[0:1], v0
875; GFX803-NEXT:    s_waitcnt vmcnt(0)
876; GFX803-NEXT:    s_setpc_b64 s[30:31]
877entry:
878  %reg.bc = bitcast i32 %reg to <2 x half>
879  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
880  %load = load i8, i8 addrspace(1)* %gep
881  %ext = zext i8 %load to i16
882  %bitcast = bitcast i16 %ext to half
883  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
884  store <2 x half> %build1, <2 x half> addrspace(1)* undef
885  ret void
886}
887
; Loads an i8 from global memory at byte offset -4095, sign-extends it to i16,
; bitcasts that to half and inserts it into element 0 of %reg viewed as
; <2 x half>; element 1 of %reg is kept. The result is stored through an undef
; global pointer.
; Expected output: gfx900 uses global_load_sbyte_d16 directly on v2; gfx906
; loads with global_load_sbyte and merges with v_and_b32 + v_lshl_or_b32;
; gfx803 uses flat_load_sbyte and merges with v_and_b32 plus an SDWA v_or_b32.
888define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
889; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
890; GFX900:       ; %bb.0: ; %entry
891; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
892; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
893; GFX900-NEXT:    s_waitcnt vmcnt(0)
894; GFX900-NEXT:    global_store_dword v[0:1], v2, off
895; GFX900-NEXT:    s_waitcnt vmcnt(0)
896; GFX900-NEXT:    s_setpc_b64 s[30:31]
897;
898; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
899; GFX906:       ; %bb.0: ; %entry
900; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
901; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
902; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
903; GFX906-NEXT:    s_waitcnt vmcnt(0)
904; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
905; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
906; GFX906-NEXT:    global_store_dword v[0:1], v0, off
907; GFX906-NEXT:    s_waitcnt vmcnt(0)
908; GFX906-NEXT:    s_setpc_b64 s[30:31]
909;
910; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
911; GFX803:       ; %bb.0: ; %entry
912; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
914; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
915; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
916; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
917; GFX803-NEXT:    s_waitcnt vmcnt(0)
918; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
919; GFX803-NEXT:    flat_store_dword v[0:1], v0
920; GFX803-NEXT:    s_waitcnt vmcnt(0)
921; GFX803-NEXT:    s_setpc_b64 s[30:31]
922entry:
923  %reg.bc = bitcast i32 %reg to <2 x half>
924  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
925  %load = load i8, i8 addrspace(1)* %gep
926  %ext = sext i8 %load to i16
927  %bitcast = bitcast i16 %ext to half
928  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
929  store <2 x half> %build1, <2 x half> addrspace(1)* undef
930  ret void
931}
932
; Loads an i16 through a flat (generic address space) pointer with no offset
; and inserts it into element 0 of %reg viewed as <2 x i16>; element 1 of %reg
; is kept. The result is stored through an undef global pointer.
; Expected output: gfx900 uses flat_load_short_d16 directly on v2; gfx906 uses
; flat_load_ushort + v_bfi_b32; gfx803 uses flat_load_ushort + v_and_b32/v_or_b32.
; The wait after the flat load covers both vmcnt and lgkmcnt on every target.
; NOTE(review): despite "reghi" in the name, %reg is an i32 that is bitcast
; exactly like in the "reglo_vreg" flat tests in this file - confirm the naming
; is intentional.
933define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
934; GFX900-LABEL: load_flat_lo_v2i16_reghi_vreg:
935; GFX900:       ; %bb.0: ; %entry
936; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937; GFX900-NEXT:    flat_load_short_d16 v2, v[0:1]
938; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
939; GFX900-NEXT:    global_store_dword v[0:1], v2, off
940; GFX900-NEXT:    s_waitcnt vmcnt(0)
941; GFX900-NEXT:    s_setpc_b64 s[30:31]
942;
943; GFX906-LABEL: load_flat_lo_v2i16_reghi_vreg:
944; GFX906:       ; %bb.0: ; %entry
945; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
946; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
947; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
948; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
949; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
950; GFX906-NEXT:    global_store_dword v[0:1], v0, off
951; GFX906-NEXT:    s_waitcnt vmcnt(0)
952; GFX906-NEXT:    s_setpc_b64 s[30:31]
953;
954; GFX803-LABEL: load_flat_lo_v2i16_reghi_vreg:
955; GFX803:       ; %bb.0: ; %entry
956; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
957; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
958; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
959; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
960; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
961; GFX803-NEXT:    flat_store_dword v[0:1], v0
962; GFX803-NEXT:    s_waitcnt vmcnt(0)
963; GFX803-NEXT:    s_setpc_b64 s[30:31]
964entry:
965  %reg.bc = bitcast i32 %reg to <2 x i16>
966  %load = load i16, i16* %in
967  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
968  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
969  ret void
970}
971
; Loads a half through a flat (generic address space) pointer with no offset
; and inserts it into element 0 of %reg viewed as <2 x half>; element 1 of %reg
; is kept. The result is stored through an undef global pointer.
; Expected output: gfx900 uses flat_load_short_d16 directly on v2; gfx906 uses
; flat_load_ushort and merges with v_and_b32 + v_lshl_or_b32; gfx803 uses
; flat_load_ushort + v_and_b32/v_or_b32.
; NOTE(review): despite "reghi" in the name, %reg is an i32 that is bitcast
; exactly like in the "reglo_vreg" flat tests in this file - confirm the naming
; is intentional.
972define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
973; GFX900-LABEL: load_flat_lo_v2f16_reghi_vreg:
974; GFX900:       ; %bb.0: ; %entry
975; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
976; GFX900-NEXT:    flat_load_short_d16 v2, v[0:1]
977; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
978; GFX900-NEXT:    global_store_dword v[0:1], v2, off
979; GFX900-NEXT:    s_waitcnt vmcnt(0)
980; GFX900-NEXT:    s_setpc_b64 s[30:31]
981;
982; GFX906-LABEL: load_flat_lo_v2f16_reghi_vreg:
983; GFX906:       ; %bb.0: ; %entry
984; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
985; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
986; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
987; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
988; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
989; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
990; GFX906-NEXT:    global_store_dword v[0:1], v0, off
991; GFX906-NEXT:    s_waitcnt vmcnt(0)
992; GFX906-NEXT:    s_setpc_b64 s[30:31]
993;
994; GFX803-LABEL: load_flat_lo_v2f16_reghi_vreg:
995; GFX803:       ; %bb.0: ; %entry
996; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
997; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
998; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
999; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1000; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1001; GFX803-NEXT:    flat_store_dword v[0:1], v0
1002; GFX803-NEXT:    s_waitcnt vmcnt(0)
1003; GFX803-NEXT:    s_setpc_b64 s[30:31]
1004
1005; FIXME: The v_and_b32 in the expected output above should be removable (presumably the 0xffff mask applied to the ushort load result on gfx906 - confirm which one is meant).
1006entry:
1007  %reg.bc = bitcast i32 %reg to <2 x half>
1008  %load = load half, half* %in
1009  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
1010  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1011  ret void
1012}
1013
; Loads an i8 through a flat pointer, zero-extends it to i16 and inserts it
; into element 0 of %reg viewed as <2 x i16>; element 1 of %reg is kept. The
; result is stored through an undef global pointer.
; Expected output: gfx900 uses flat_load_ubyte_d16 directly on v2; gfx906 uses
; flat_load_ubyte + v_bfi_b32; gfx803 uses flat_load_ubyte and merges with
; v_perm_b32.
1014define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
1015; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
1016; GFX900:       ; %bb.0: ; %entry
1017; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1018; GFX900-NEXT:    flat_load_ubyte_d16 v2, v[0:1]
1019; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1020; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1021; GFX900-NEXT:    s_waitcnt vmcnt(0)
1022; GFX900-NEXT:    s_setpc_b64 s[30:31]
1023;
1024; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
1025; GFX906:       ; %bb.0: ; %entry
1026; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1027; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
1028; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
1029; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1030; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
1031; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1032; GFX906-NEXT:    s_waitcnt vmcnt(0)
1033; GFX906-NEXT:    s_setpc_b64 s[30:31]
1034;
1035; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
1036; GFX803:       ; %bb.0: ; %entry
1037; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1038; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
1039; GFX803-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1040; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
1041; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1042; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
1043; GFX803-NEXT:    flat_store_dword v[0:1], v0
1044; GFX803-NEXT:    s_waitcnt vmcnt(0)
1045; GFX803-NEXT:    s_setpc_b64 s[30:31]
1046entry:
1047  %reg.bc = bitcast i32 %reg to <2 x i16>
1048  %load = load i8, i8* %in
1049  %ext = zext i8 %load to i16
1050  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
1051  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1052  ret void
1053}
1054
; Loads an i8 through a flat pointer, sign-extends it to i16 and inserts it
; into element 0 of %reg viewed as <2 x i16>; element 1 of %reg is kept. The
; result is stored through an undef global pointer.
; Expected output: gfx900 uses flat_load_sbyte_d16 directly on v2; gfx906 uses
; flat_load_sbyte + v_bfi_b32; gfx803 uses flat_load_sbyte and merges with
; v_and_b32 plus an SDWA v_or_b32 that selects the low word of the load.
1055define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
1056; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
1057; GFX900:       ; %bb.0: ; %entry
1058; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1059; GFX900-NEXT:    flat_load_sbyte_d16 v2, v[0:1]
1060; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1061; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1062; GFX900-NEXT:    s_waitcnt vmcnt(0)
1063; GFX900-NEXT:    s_setpc_b64 s[30:31]
1064;
1065; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
1066; GFX906:       ; %bb.0: ; %entry
1067; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1068; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
1069; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
1070; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1071; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
1072; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1073; GFX906-NEXT:    s_waitcnt vmcnt(0)
1074; GFX906-NEXT:    s_setpc_b64 s[30:31]
1075;
1076; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
1077; GFX803:       ; %bb.0: ; %entry
1078; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1079; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
1080; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1081; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1082; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1083; GFX803-NEXT:    flat_store_dword v[0:1], v0
1084; GFX803-NEXT:    s_waitcnt vmcnt(0)
1085; GFX803-NEXT:    s_setpc_b64 s[30:31]
1086entry:
1087  %reg.bc = bitcast i32 %reg to <2 x i16>
1088  %load = load i8, i8* %in
1089  %ext = sext i8 %load to i16
1090  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
1091  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1092  ret void
1093}
1094
; Loads an i8 through a flat pointer, zero-extends it to i16, bitcasts that to
; half and inserts it into element 0 of %reg viewed as <2 x half>; element 1 of
; %reg is kept. The result is stored through an undef global pointer.
; Expected output: gfx900 uses flat_load_ubyte_d16 directly on v2; gfx906 uses
; flat_load_ubyte and merges with v_and_b32 + v_lshl_or_b32; gfx803 uses
; flat_load_ubyte and merges with v_perm_b32.
1095define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
1096; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
1097; GFX900:       ; %bb.0: ; %entry
1098; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1099; GFX900-NEXT:    flat_load_ubyte_d16 v2, v[0:1]
1100; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1101; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1102; GFX900-NEXT:    s_waitcnt vmcnt(0)
1103; GFX900-NEXT:    s_setpc_b64 s[30:31]
1104;
1105; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
1106; GFX906:       ; %bb.0: ; %entry
1107; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1108; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
1109; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1110; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1111; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1112; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1113; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1114; GFX906-NEXT:    s_waitcnt vmcnt(0)
1115; GFX906-NEXT:    s_setpc_b64 s[30:31]
1116;
1117; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
1118; GFX803:       ; %bb.0: ; %entry
1119; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1120; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
1121; GFX803-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1122; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
1123; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1124; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
1125; GFX803-NEXT:    flat_store_dword v[0:1], v0
1126; GFX803-NEXT:    s_waitcnt vmcnt(0)
1127; GFX803-NEXT:    s_setpc_b64 s[30:31]
1128entry:
1129  %reg.bc = bitcast i32 %reg to <2 x half>
1130  %load = load i8, i8* %in
1131  %ext = zext i8 %load to i16
1132  %bitcast = bitcast i16 %ext to half
1133  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
1134  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1135  ret void
1136}
1137
; Loads an i8 through a flat pointer, sign-extends it to i16, bitcasts that to
; half and inserts it into element 0 of %reg viewed as <2 x half>; element 1 of
; %reg is kept. The result is stored through an undef global pointer.
; Expected output: gfx900 uses flat_load_sbyte_d16 directly on v2; gfx906 uses
; flat_load_sbyte and merges with v_and_b32 + v_lshl_or_b32; gfx803 uses
; flat_load_sbyte and merges with v_and_b32 plus an SDWA v_or_b32.
1138define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
1139; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
1140; GFX900:       ; %bb.0: ; %entry
1141; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1142; GFX900-NEXT:    flat_load_sbyte_d16 v2, v[0:1]
1143; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1144; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1145; GFX900-NEXT:    s_waitcnt vmcnt(0)
1146; GFX900-NEXT:    s_setpc_b64 s[30:31]
1147;
1148; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
1149; GFX906:       ; %bb.0: ; %entry
1150; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1151; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
1152; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1153; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1154; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1155; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1156; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1157; GFX906-NEXT:    s_waitcnt vmcnt(0)
1158; GFX906-NEXT:    s_setpc_b64 s[30:31]
1159;
1160; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
1161; GFX803:       ; %bb.0: ; %entry
1162; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1163; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
1164; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1165; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1166; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1167; GFX803-NEXT:    flat_store_dword v[0:1], v0
1168; GFX803-NEXT:    s_waitcnt vmcnt(0)
1169; GFX803-NEXT:    s_setpc_b64 s[30:31]
1170entry:
1171  %reg.bc = bitcast i32 %reg to <2 x half>
1172  %load = load i8, i8* %in
1173  %ext = sext i8 %load to i16
1174  %bitcast = bitcast i16 %ext to half
1175  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
1176  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1177  ret void
1178}
1179
; Loads an i16 from private (addrspace 5) memory at element index 2047 of the
; byval argument %in (byte offset 4094 from s32 in the expected output) and
; inserts it into element 0 of %reg viewed as <2 x i16>; element 1 of %reg is
; kept. The result is stored through an undef global pointer.
; Expected output: gfx900 with MUBUF scratch uses buffer_load_short_d16 and
; gfx900 with flat scratch uses scratch_load_short_d16, both directly on v0;
; gfx906 uses buffer_load_ushort + v_bfi_b32; gfx803 uses buffer_load_ushort +
; v_and_b32/v_or_b32.
1180define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, i32 %reg) #0 {
1181; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg:
1182; GFX900-MUBUF:       ; %bb.0: ; %entry
1183; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1184; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
1185; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1186; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1187; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1188; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1189;
1190; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg:
1191; GFX906:       ; %bb.0: ; %entry
1192; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1193; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1194; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1195; GFX906-NEXT:    s_waitcnt vmcnt(0)
1196; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1197; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1198; GFX906-NEXT:    s_waitcnt vmcnt(0)
1199; GFX906-NEXT:    s_setpc_b64 s[30:31]
1200;
1201; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg:
1202; GFX803:       ; %bb.0: ; %entry
1203; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1204; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1205; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1206; GFX803-NEXT:    s_waitcnt vmcnt(0)
1207; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1208; GFX803-NEXT:    flat_store_dword v[0:1], v0
1209; GFX803-NEXT:    s_waitcnt vmcnt(0)
1210; GFX803-NEXT:    s_setpc_b64 s[30:31]
1211;
1212; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg:
1213; GFX900-FLATSCR:       ; %bb.0: ; %entry
1214; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1215; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s32 offset:4094
1216; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1217; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1218; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1219; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1220entry:
1221  %reg.bc = bitcast i32 %reg to <2 x i16>
1222  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
1223  %load = load i16, i16 addrspace(5)* %gep
1224  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
1225  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1226  ret void
1227}
1228
; Loads an i16 from private (addrspace 5) memory at element index 2047 of the
; byval argument %in and builds a <2 x i16> whose element 1 is the scalar i16
; %reg and whose element 0 is the loaded value. The result is stored through
; an undef global pointer.
; Expected output: no d16 load on any target here. gfx900 (MUBUF and flat
; scratch) and gfx906 use a plain ushort load followed by v_and_b32 +
; v_lshl_or_b32; gfx803 uses buffer_load_ushort, v_lshlrev_b32 and v_or_b32.
1229define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
1230; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg:
1231; GFX900-MUBUF:       ; %bb.0: ; %entry
1232; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1233; GFX900-MUBUF-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1234; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1235; GFX900-MUBUF-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1236; GFX900-MUBUF-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1237; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1238; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1239; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1240;
1241; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg:
1242; GFX906:       ; %bb.0: ; %entry
1243; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1244; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1245; GFX906-NEXT:    s_waitcnt vmcnt(0)
1246; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1247; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1248; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1249; GFX906-NEXT:    s_waitcnt vmcnt(0)
1250; GFX906-NEXT:    s_setpc_b64 s[30:31]
1251;
1252; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg:
1253; GFX803:       ; %bb.0: ; %entry
1254; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1255; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1256; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1257; GFX803-NEXT:    s_waitcnt vmcnt(0)
1258; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1259; GFX803-NEXT:    flat_store_dword v[0:1], v0
1260; GFX803-NEXT:    s_waitcnt vmcnt(0)
1261; GFX803-NEXT:    s_setpc_b64 s[30:31]
1262;
1263; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg:
1264; GFX900-FLATSCR:       ; %bb.0: ; %entry
1265; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1266; GFX900-FLATSCR-NEXT:    scratch_load_ushort v1, off, s32 offset:4094
1267; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1268; GFX900-FLATSCR-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1269; GFX900-FLATSCR-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1270; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1271; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1272; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1273entry:
1274  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
1275  %load = load i16, i16 addrspace(5)* %gep
1276  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
1277  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
1278  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1279  ret void
1280}
1281
; Loads a half from private (addrspace 5) memory at element index 2047 of the
; byval argument %in (byte offset 4094 from s32 in the expected output) and
; inserts it into element 0 of %reg viewed as <2 x half>; element 1 of %reg is
; kept. The result is stored through an undef global pointer.
; Expected output: gfx900 with MUBUF scratch uses buffer_load_short_d16 and
; gfx900 with flat scratch uses scratch_load_short_d16, both directly on v0;
; gfx906 uses buffer_load_ushort and merges with v_and_b32 + v_lshl_or_b32;
; gfx803 uses buffer_load_ushort + v_and_b32/v_or_b32.
1282define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in, i32 %reg) #0 {
1283; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg:
1284; GFX900-MUBUF:       ; %bb.0: ; %entry
1285; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1286; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
1287; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1288; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1289; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1290; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1291;
1292; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg:
1293; GFX906:       ; %bb.0: ; %entry
1294; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1295; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1296; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1297; GFX906-NEXT:    s_waitcnt vmcnt(0)
1298; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1299; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1300; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1301; GFX906-NEXT:    s_waitcnt vmcnt(0)
1302; GFX906-NEXT:    s_setpc_b64 s[30:31]
1303;
1304; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg:
1305; GFX803:       ; %bb.0: ; %entry
1306; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1307; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1308; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1309; GFX803-NEXT:    s_waitcnt vmcnt(0)
1310; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1311; GFX803-NEXT:    flat_store_dword v[0:1], v0
1312; GFX803-NEXT:    s_waitcnt vmcnt(0)
1313; GFX803-NEXT:    s_setpc_b64 s[30:31]
1314;
1315; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg:
1316; GFX900-FLATSCR:       ; %bb.0: ; %entry
1317; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1318; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s32 offset:4094
1319; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1320; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1321; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1322; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1323entry:
1324  %reg.bc = bitcast i32 %reg to <2 x half>
1325  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
1326  %load = load half, half addrspace(5)* %gep
1327  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
1328  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1329  ret void
1330}
1331
; Volatile i16 load from the constant private (addrspace 5) address 4094
; (formed with inttoptr; the %in argument is not used by the body). The loaded
; value is inserted into element 0 of %reg viewed as <2 x i16>; element 1 of
; %reg is kept. The result is stored through an undef global pointer.
; Expected output: every load carries the glc modifier. gfx900 with MUBUF
; scratch uses buffer_load_short_d16 with a zero soffset and offset:4094;
; gfx900 with flat scratch puts 0xffe in s0 and uses scratch_load_short_d16;
; gfx906 and gfx803 use buffer_load_ushort and merge after the wait.
1332define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
1333; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
1334; GFX900-MUBUF:       ; %bb.0: ; %entry
1335; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1336; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
1337; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1338; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1339; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1340; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1341;
1342; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
1343; GFX906:       ; %bb.0: ; %entry
1344; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1345; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1346; GFX906-NEXT:    s_waitcnt vmcnt(0)
1347; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1348; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
1349; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1350; GFX906-NEXT:    s_waitcnt vmcnt(0)
1351; GFX906-NEXT:    s_setpc_b64 s[30:31]
1352;
1353; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
1354; GFX803:       ; %bb.0: ; %entry
1355; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1356; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1357; GFX803-NEXT:    s_waitcnt vmcnt(0)
1358; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1359; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1360; GFX803-NEXT:    flat_store_dword v[0:1], v0
1361; GFX803-NEXT:    s_waitcnt vmcnt(0)
1362; GFX803-NEXT:    s_setpc_b64 s[30:31]
1363;
1364; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
1365; GFX900-FLATSCR:       ; %bb.0: ; %entry
1366; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1367; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1368; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
1369; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1370; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1371; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1372; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1373entry:
1374  %reg.bc = bitcast i32 %reg to <2 x i16>
1375  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
1376  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
1377  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1378  ret void
1379}
1380
; Volatile i16 load from the constant private (addrspace 5) address 4094
; (formed with inttoptr; the %in argument is not used by the body). The loaded
; value is inserted into element 0 of %reg viewed as <2 x i16>; element 1 of
; %reg is kept. The result is stored through an undef global pointer.
; Expected output: every load carries the glc modifier; gfx900 selects a d16
; load (buffer_load_short_d16 or scratch_load_short_d16), while gfx906 and
; gfx803 use buffer_load_ushort and merge after the wait.
; NOTE(review): the IR body and the expected output are identical to
; @load_private_lo_v2i16_reglo_vreg_nooff; nothing here places %reg in the
; high element from a scalar as "reghi" suggests - confirm whether this
; duplicate is intentional.
1381define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
1382; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
1383; GFX900-MUBUF:       ; %bb.0: ; %entry
1384; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1385; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
1386; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1387; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1388; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1389; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1390;
1391; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
1392; GFX906:       ; %bb.0: ; %entry
1393; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1394; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1395; GFX906-NEXT:    s_waitcnt vmcnt(0)
1396; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1397; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
1398; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1399; GFX906-NEXT:    s_waitcnt vmcnt(0)
1400; GFX906-NEXT:    s_setpc_b64 s[30:31]
1401;
1402; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
1403; GFX803:       ; %bb.0: ; %entry
1404; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1405; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1406; GFX803-NEXT:    s_waitcnt vmcnt(0)
1407; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1408; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1409; GFX803-NEXT:    flat_store_dword v[0:1], v0
1410; GFX803-NEXT:    s_waitcnt vmcnt(0)
1411; GFX803-NEXT:    s_setpc_b64 s[30:31]
1412;
1413; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
1414; GFX900-FLATSCR:       ; %bb.0: ; %entry
1415; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1416; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1417; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
1418; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1419; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1420; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1421; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1422entry:
1423  %reg.bc = bitcast i32 %reg to <2 x i16>
1424  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
1425  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
1426  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1427  ret void
1428}
1429
; Test: half-typed variant of the fixed-address private load. A volatile half
; is loaded from the literal addrspace(5) address 4094 and inserted into
; element 0 of the <2 x half> view of %reg; the vector is then stored to an
; undef global pointer. %in is never referenced.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1430define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 {
1431; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
1432; GFX900-MUBUF:       ; %bb.0: ; %entry
1433; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1434; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
1435; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1436; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1437; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1438; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1439;
1440; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
1441; GFX906:       ; %bb.0: ; %entry
1442; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1443; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1444; GFX906-NEXT:    s_waitcnt vmcnt(0)
1445; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1446; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1447; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1448; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1449; GFX906-NEXT:    s_waitcnt vmcnt(0)
1450; GFX906-NEXT:    s_setpc_b64 s[30:31]
1451;
1452; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
1453; GFX803:       ; %bb.0: ; %entry
1454; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1455; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1456; GFX803-NEXT:    s_waitcnt vmcnt(0)
1457; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1458; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1459; GFX803-NEXT:    flat_store_dword v[0:1], v0
1460; GFX803-NEXT:    s_waitcnt vmcnt(0)
1461; GFX803-NEXT:    s_setpc_b64 s[30:31]
1462;
1463; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
1464; GFX900-FLATSCR:       ; %bb.0: ; %entry
1465; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1466; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1467; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
1468; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1469; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1470; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1471; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1472entry:
  ; View the incoming i32 as two half elements.
1473  %reg.bc = bitcast i32 %reg to <2 x half>
  ; The address is a literal (inttoptr 4094); no pointer argument is used.
1474  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  ; Replace element 0 only; element 1 still comes from %reg.
1475  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
1476  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1477  ret void
1478}
1479
; Test: an i8 is loaded from byte offset 4095 past the byval private pointer
; %in, zero-extended to i16, and inserted into element 0 of the <2 x i16> view
; of %reg; the vector is then stored to an undef global pointer.
; The load is not volatile here, unlike the "nooff" variants.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1480define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, i32 %reg) #0 {
1481; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
1482; GFX900-MUBUF:       ; %bb.0: ; %entry
1483; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1484; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
1485; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1486; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1487; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1488; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1489;
1490; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
1491; GFX906:       ; %bb.0: ; %entry
1492; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1493; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
1494; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1495; GFX906-NEXT:    s_waitcnt vmcnt(0)
1496; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1497; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1498; GFX906-NEXT:    s_waitcnt vmcnt(0)
1499; GFX906-NEXT:    s_setpc_b64 s[30:31]
1500;
1501; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
1502; GFX803:       ; %bb.0: ; %entry
1503; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1504; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
1505; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1506; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
1507; GFX803-NEXT:    s_waitcnt vmcnt(0)
1508; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
1509; GFX803-NEXT:    flat_store_dword v[0:1], v0
1510; GFX803-NEXT:    s_waitcnt vmcnt(0)
1511; GFX803-NEXT:    s_setpc_b64 s[30:31]
1512;
1513; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
1514; GFX900-FLATSCR:       ; %bb.0: ; %entry
1515; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1516; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s32 offset:4095
1517; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1518; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1519; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1520; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1521entry:
  ; View the incoming i32 as two 16-bit elements.
1522  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Byte offset 4095 from %in; the checks above expect it as "offset:4095".
1523  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
1524  %load = load i8, i8 addrspace(5)* %gep
  ; Zero-extend the byte to the 16-bit element width before inserting it.
1525  %ext = zext i8 %load to i16
  ; Replace element 0 only; element 1 still comes from %reg.
1526  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
1527  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1528  ret void
1529}
1530
; Test: an i8 is loaded from byte offset 4095 past the byval private pointer
; %in, sign-extended to i16, and inserted into element 0 of the <2 x i16> view
; of %reg; the vector is then stored to an undef global pointer.
; The load is not volatile here, unlike the "nooff" variants.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1531define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, i32 %reg) #0 {
1532; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
1533; GFX900-MUBUF:       ; %bb.0: ; %entry
1534; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1535; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
1536; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1537; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1538; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1539; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1540;
1541; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
1542; GFX906:       ; %bb.0: ; %entry
1543; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1544; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
1545; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1546; GFX906-NEXT:    s_waitcnt vmcnt(0)
1547; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1548; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1549; GFX906-NEXT:    s_waitcnt vmcnt(0)
1550; GFX906-NEXT:    s_setpc_b64 s[30:31]
1551;
1552; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
1553; GFX803:       ; %bb.0: ; %entry
1554; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1555; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
1556; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1557; GFX803-NEXT:    s_waitcnt vmcnt(0)
1558; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1559; GFX803-NEXT:    flat_store_dword v[0:1], v0
1560; GFX803-NEXT:    s_waitcnt vmcnt(0)
1561; GFX803-NEXT:    s_setpc_b64 s[30:31]
1562;
1563; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
1564; GFX900-FLATSCR:       ; %bb.0: ; %entry
1565; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1566; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s32 offset:4095
1567; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1568; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1569; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1570; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1571entry:
  ; View the incoming i32 as two 16-bit elements.
1572  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Byte offset 4095 from %in; the checks above expect it as "offset:4095".
1573  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
1574  %load = load i8, i8 addrspace(5)* %gep
  ; Sign-extend the byte to the 16-bit element width before inserting it.
1575  %ext = sext i8 %load to i16
  ; Replace element 0 only; element 1 still comes from %reg.
1576  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
1577  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1578  ret void
1579}
1580
; Test: a volatile i8 load from the literal private (addrspace 5) address 4094
; is zero-extended to i16 and inserted into element 0 of the <2 x i16> view of
; %reg; the vector is then stored to an undef global pointer.
; %in is never referenced.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1581define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
1582; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
1583; GFX900-MUBUF:       ; %bb.0: ; %entry
1584; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1585; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
1586; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1587; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1588; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1589; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1590;
1591; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
1592; GFX906:       ; %bb.0: ; %entry
1593; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1594; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
1595; GFX906-NEXT:    s_waitcnt vmcnt(0)
1596; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1597; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
1598; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1599; GFX906-NEXT:    s_waitcnt vmcnt(0)
1600; GFX906-NEXT:    s_setpc_b64 s[30:31]
1601;
1602; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
1603; GFX803:       ; %bb.0: ; %entry
1604; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1605; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1606; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc
1607; GFX803-NEXT:    s_waitcnt vmcnt(0)
1608; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
1609; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
1610; GFX803-NEXT:    flat_store_dword v[0:1], v0
1611; GFX803-NEXT:    s_waitcnt vmcnt(0)
1612; GFX803-NEXT:    s_setpc_b64 s[30:31]
1613;
1614; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
1615; GFX900-FLATSCR:       ; %bb.0: ; %entry
1616; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1617; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1618; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v1, off, s0 glc
1619; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1620; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1621; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1622; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1623entry:
  ; View the incoming i32 as two 16-bit elements.
1624  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; The address is a literal (inttoptr 4094); no pointer argument is used.
1625  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  ; Zero-extend the byte to the 16-bit element width before inserting it.
1626  %ext = zext i8 %load to i16
  ; Replace element 0 only; element 1 still comes from %reg.
1627  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
1628  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1629  ret void
1630}
1631
; Test: a volatile i8 load from the literal private (addrspace 5) address 4094
; is sign-extended to i16 and inserted into element 0 of the <2 x i16> view of
; %reg; the vector is then stored to an undef global pointer.
; %in is never referenced.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1632define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
1633; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
1634; GFX900-MUBUF:       ; %bb.0: ; %entry
1635; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1636; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 glc
1637; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1638; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1639; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1640; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1641;
1642; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
1643; GFX906:       ; %bb.0: ; %entry
1644; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1645; GFX906-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
1646; GFX906-NEXT:    s_waitcnt vmcnt(0)
1647; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1648; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
1649; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1650; GFX906-NEXT:    s_waitcnt vmcnt(0)
1651; GFX906-NEXT:    s_setpc_b64 s[30:31]
1652;
1653; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
1654; GFX803:       ; %bb.0: ; %entry
1655; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1656; GFX803-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
1657; GFX803-NEXT:    s_waitcnt vmcnt(0)
1658; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1659; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1660; GFX803-NEXT:    flat_store_dword v[0:1], v0
1661; GFX803-NEXT:    s_waitcnt vmcnt(0)
1662; GFX803-NEXT:    s_setpc_b64 s[30:31]
1663;
1664; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
1665; GFX900-FLATSCR:       ; %bb.0: ; %entry
1666; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1667; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1668; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v1, off, s0 glc
1669; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1670; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1671; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1672; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1673entry:
  ; View the incoming i32 as two 16-bit elements.
1674  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; The address is a literal (inttoptr 4094); no pointer argument is used.
1675  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  ; Sign-extend the byte to the 16-bit element width before inserting it.
1676  %ext = sext i8 %load to i16
  ; Replace element 0 only; element 1 still comes from %reg.
1677  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
1678  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1679  ret void
1680}
1681
; Test: a volatile i8 load from the literal private (addrspace 5) address 4094
; is zero-extended to i16, bitcast to half, and inserted into element 0 of the
; <2 x half> view of %reg; the vector is then stored to an undef global
; pointer. %in is never referenced.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1682define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
1683; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
1684; GFX900-MUBUF:       ; %bb.0: ; %entry
1685; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1686; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
1687; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1688; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1689; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1690; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1691;
1692; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
1693; GFX906:       ; %bb.0: ; %entry
1694; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1695; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
1696; GFX906-NEXT:    s_waitcnt vmcnt(0)
1697; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1698; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1699; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1700; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1701; GFX906-NEXT:    s_waitcnt vmcnt(0)
1702; GFX906-NEXT:    s_setpc_b64 s[30:31]
1703;
1704; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
1705; GFX803:       ; %bb.0: ; %entry
1706; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1707; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1708; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc
1709; GFX803-NEXT:    s_waitcnt vmcnt(0)
1710; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
1711; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
1712; GFX803-NEXT:    flat_store_dword v[0:1], v0
1713; GFX803-NEXT:    s_waitcnt vmcnt(0)
1714; GFX803-NEXT:    s_setpc_b64 s[30:31]
1715;
1716; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
1717; GFX900-FLATSCR:       ; %bb.0: ; %entry
1718; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1719; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1720; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v1, off, s0 glc
1721; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1722; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1723; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1724; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1725entry:
  ; View the incoming i32 as two half elements.
1726  %reg.bc = bitcast i32 %reg to <2 x half>
  ; The address is a literal (inttoptr 4094); no pointer argument is used.
1727  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  ; Widen the byte to 16 bits, then reinterpret those bits as a half.
1728  %ext = zext i8 %load to i16
1729  %bc.ext = bitcast i16 %ext to half
  ; Replace element 0 only; element 1 still comes from %reg.
1730  %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
1731  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1732  ret void
1733}
1734
; Test: an i16 is loaded from the constant address space (addrspace 4) at
; element index -2047 relative to %in, then inserted into element 0 of the
; <2 x i16> view of %reg; the vector is stored to an undef global pointer.
; Only the shared gfx900 prefix is checked here (no MUBUF/FLATSCR split).
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1735define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
1736; GFX900-LABEL: load_constant_lo_v2i16_reglo_vreg:
1737; GFX900:       ; %bb.0: ; %entry
1738; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1739; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
1740; GFX900-NEXT:    s_waitcnt vmcnt(0)
1741; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1742; GFX900-NEXT:    s_waitcnt vmcnt(0)
1743; GFX900-NEXT:    s_setpc_b64 s[30:31]
1744;
1745; GFX906-LABEL: load_constant_lo_v2i16_reglo_vreg:
1746; GFX906:       ; %bb.0: ; %entry
1747; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1748; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
1749; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
1750; GFX906-NEXT:    s_waitcnt vmcnt(0)
1751; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
1752; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1753; GFX906-NEXT:    s_waitcnt vmcnt(0)
1754; GFX906-NEXT:    s_setpc_b64 s[30:31]
1755;
1756; GFX803-LABEL: load_constant_lo_v2i16_reglo_vreg:
1757; GFX803:       ; %bb.0: ; %entry
1758; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1759; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
1760; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1761; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
1762; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1763; GFX803-NEXT:    s_waitcnt vmcnt(0)
1764; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1765; GFX803-NEXT:    flat_store_dword v[0:1], v0
1766; GFX803-NEXT:    s_waitcnt vmcnt(0)
1767; GFX803-NEXT:    s_setpc_b64 s[30:31]
1768entry:
  ; View the incoming i32 as two 16-bit elements.
1769  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; -2047 i16 elements = -4094 bytes, matching "offset:-4094" in the checks.
1770  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
1771  %load = load i16, i16 addrspace(4)* %gep
  ; Replace element 0 only; element 1 still comes from %reg.
1772  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
1773  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1774  ret void
1775}
1776
; Test: a half is loaded from the constant address space (addrspace 4) at
; element index -2047 relative to %in, then inserted into element 0 of the
; <2 x half> view of %reg; the vector is stored to an undef global pointer.
; Only the shared gfx900 prefix is checked here (no MUBUF/FLATSCR split).
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1777define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
1778; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg:
1779; GFX900:       ; %bb.0: ; %entry
1780; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1781; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
1782; GFX900-NEXT:    s_waitcnt vmcnt(0)
1783; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1784; GFX900-NEXT:    s_waitcnt vmcnt(0)
1785; GFX900-NEXT:    s_setpc_b64 s[30:31]
1786;
1787; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg:
1788; GFX906:       ; %bb.0: ; %entry
1789; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1790; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
1791; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1792; GFX906-NEXT:    s_waitcnt vmcnt(0)
1793; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1794; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1795; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1796; GFX906-NEXT:    s_waitcnt vmcnt(0)
1797; GFX906-NEXT:    s_setpc_b64 s[30:31]
1798;
1799; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg:
1800; GFX803:       ; %bb.0: ; %entry
1801; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1802; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
1803; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1804; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
1805; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1806; GFX803-NEXT:    s_waitcnt vmcnt(0)
1807; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1808; GFX803-NEXT:    flat_store_dword v[0:1], v0
1809; GFX803-NEXT:    s_waitcnt vmcnt(0)
1810; GFX803-NEXT:    s_setpc_b64 s[30:31]
1811entry:
  ; View the incoming i32 as two half elements.
1812  %reg.bc = bitcast i32 %reg to <2 x half>
  ; -2047 half elements = -4094 bytes, matching "offset:-4094" in the checks.
1813  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
1814  %load = load half, half addrspace(4)* %gep
  ; Replace element 0 only; element 1 still comes from %reg.
1815  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
1816  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1817  ret void
1818}
1819
; Test: an i8 is loaded from the constant address space (addrspace 4) at byte
; offset -4095 relative to %in, zero-extended to i16, bitcast to half, and
; inserted into element 0 of the <2 x half> view of %reg; the vector is stored
; to an undef global pointer.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1820define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
1821; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
1822; GFX900:       ; %bb.0: ; %entry
1823; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1824; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
1825; GFX900-NEXT:    s_waitcnt vmcnt(0)
1826; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1827; GFX900-NEXT:    s_waitcnt vmcnt(0)
1828; GFX900-NEXT:    s_setpc_b64 s[30:31]
1829;
1830; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
1831; GFX906:       ; %bb.0: ; %entry
1832; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1833; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
1834; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1835; GFX906-NEXT:    s_waitcnt vmcnt(0)
1836; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1837; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1838; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1839; GFX906-NEXT:    s_waitcnt vmcnt(0)
1840; GFX906-NEXT:    s_setpc_b64 s[30:31]
1841;
1842; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
1843; GFX803:       ; %bb.0: ; %entry
1844; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1845; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
1846; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1847; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
1848; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1849; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
1850; GFX803-NEXT:    s_waitcnt vmcnt(0)
1851; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
1852; GFX803-NEXT:    flat_store_dword v[0:1], v0
1853; GFX803-NEXT:    s_waitcnt vmcnt(0)
1854; GFX803-NEXT:    s_setpc_b64 s[30:31]
1855entry:
  ; View the incoming i32 as two half elements.
1856  %reg.bc = bitcast i32 %reg to <2 x half>
  ; Byte offset -4095 from %in, matching "offset:-4095" in the checks.
1857  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
1858  %load = load i8, i8 addrspace(4)* %gep
  ; Widen the byte to 16 bits, then reinterpret those bits as a half.
1859  %ext = zext i8 %load to i16
1860  %bitcast = bitcast i16 %ext to half
  ; Replace element 0 only; element 1 still comes from %reg.
1861  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
1862  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1863  ret void
1864}
1865
; Test: an i8 is loaded from the constant address space (addrspace 4) at byte
; offset -4095 relative to %in, sign-extended to i16, bitcast to half, and
; inserted into element 0 of the <2 x half> view of %reg; the vector is stored
; to an undef global pointer.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1866define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
1867; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
1868; GFX900:       ; %bb.0: ; %entry
1869; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1870; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
1871; GFX900-NEXT:    s_waitcnt vmcnt(0)
1872; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1873; GFX900-NEXT:    s_waitcnt vmcnt(0)
1874; GFX900-NEXT:    s_setpc_b64 s[30:31]
1875;
1876; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
1877; GFX906:       ; %bb.0: ; %entry
1878; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1879; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
1880; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1881; GFX906-NEXT:    s_waitcnt vmcnt(0)
1882; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1883; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1884; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1885; GFX906-NEXT:    s_waitcnt vmcnt(0)
1886; GFX906-NEXT:    s_setpc_b64 s[30:31]
1887;
1888; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
1889; GFX803:       ; %bb.0: ; %entry
1890; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1891; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
1892; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1893; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
1894; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1895; GFX803-NEXT:    s_waitcnt vmcnt(0)
1896; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1897; GFX803-NEXT:    flat_store_dword v[0:1], v0
1898; GFX803-NEXT:    s_waitcnt vmcnt(0)
1899; GFX803-NEXT:    s_setpc_b64 s[30:31]
1900entry:
  ; View the incoming i32 as two half elements.
1901  %reg.bc = bitcast i32 %reg to <2 x half>
  ; Byte offset -4095 from %in, matching "offset:-4095" in the checks.
1902  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
1903  %load = load i8, i8 addrspace(4)* %gep
  ; Sign-extend the byte to 16 bits, then reinterpret those bits as a half.
1904  %ext = sext i8 %load to i16
1905  %bitcast = bitcast i16 %ext to half
  ; Replace element 0 only; element 1 still comes from %reg.
1906  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
1907  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1908  ret void
1909}
1910
; Test: private-memory load whose address comes from a stack object rather than
; from an argument. Two allocas are created; a volatile store of 123 goes to
; the first, and a volatile i16 load reads element 2027 of the second. The
; loaded value is inserted into element 0 of the <2 x i16> view of %reg and the
; vector is stored to an undef global pointer.
; The checks expect the load at "s32 offset:4094", which equals the 40 bytes of
; %obj0 plus 2027 * 2 bytes into %obj1.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1911define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
1912; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
1913; GFX900-MUBUF:       ; %bb.0: ; %entry
1914; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1915; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
1916; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
1917; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1918; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 glc
1919; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1920; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1921; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1922; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1923;
1924; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
1925; GFX906:       ; %bb.0: ; %entry
1926; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1927; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
1928; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
1929; GFX906-NEXT:    s_waitcnt vmcnt(0)
1930; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc
1931; GFX906-NEXT:    s_waitcnt vmcnt(0)
1932; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1933; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1934; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1935; GFX906-NEXT:    s_waitcnt vmcnt(0)
1936; GFX906-NEXT:    s_setpc_b64 s[30:31]
1937;
1938; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
1939; GFX803:       ; %bb.0: ; %entry
1940; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1941; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
1942; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
1943; GFX803-NEXT:    s_waitcnt vmcnt(0)
1944; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc
1945; GFX803-NEXT:    s_waitcnt vmcnt(0)
1946; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1947; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1948; GFX803-NEXT:    flat_store_dword v[0:1], v0
1949; GFX803-NEXT:    s_waitcnt vmcnt(0)
1950; GFX803-NEXT:    s_setpc_b64 s[30:31]
1951;
1952; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
1953; GFX900-FLATSCR:       ; %bb.0: ; %entry
1954; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1955; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
1956; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
1957; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1958; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s32 offset:4094 glc
1959; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1960; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1961; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1962; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1963entry:
  ; %obj0 (40 bytes) is only written; %obj1 is the object that gets read.
1964  %obj0 = alloca [10 x i32], align 4, addrspace(5)
1965  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
1966  %reg.bc = bitcast i32 %reg to <2 x i16>
1967  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile, so the store to %obj0 must be emitted (0x7b in the checks).
1968  store volatile i32 123, i32 addrspace(5)* %bc
  ; Element 2027 of the i16 array, i.e. 4054 bytes into %obj1.
1969  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
1970  %load = load volatile i16, i16 addrspace(5)* %gep
  ; Replace element 0 only; element 1 still comes from %reg.
1971  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
1972  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1973  ret void
1974}
1975
; Test: byte-sized, sign-extending variant of the stack-object load. Two
; allocas are created; a volatile store of 123 goes to the first, and a
; volatile i8 load reads element 4055 of the second. The byte is sign-extended
; to i16, inserted into element 0 of the <2 x i16> view of %reg, and the vector
; is stored to an undef global pointer.
; The checks expect the load at "s32 offset:4095", which equals the 40 bytes of
; %obj0 plus 4055 bytes into %obj1.
; The assertions below are autogenerated (utils/update_llc_test_checks.py).
1976define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
1977; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
1978; GFX900-MUBUF:       ; %bb.0: ; %entry
1979; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1980; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
1981; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
1982; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1983; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc
1984; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1985; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1986; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1987; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1988;
1989; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
1990; GFX906:       ; %bb.0: ; %entry
1991; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1992; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
1993; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
1994; GFX906-NEXT:    s_waitcnt vmcnt(0)
1995; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
1996; GFX906-NEXT:    s_waitcnt vmcnt(0)
1997; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1998; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1999; GFX906-NEXT:    global_store_dword v[0:1], v0, off
2000; GFX906-NEXT:    s_waitcnt vmcnt(0)
2001; GFX906-NEXT:    s_setpc_b64 s[30:31]
2002;
2003; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
2004; GFX803:       ; %bb.0: ; %entry
2005; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2006; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
2007; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2008; GFX803-NEXT:    s_waitcnt vmcnt(0)
2009; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
2010; GFX803-NEXT:    s_waitcnt vmcnt(0)
2011; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2012; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2013; GFX803-NEXT:    flat_store_dword v[0:1], v0
2014; GFX803-NEXT:    s_waitcnt vmcnt(0)
2015; GFX803-NEXT:    s_setpc_b64 s[30:31]
2016;
2017; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
2018; GFX900-FLATSCR:       ; %bb.0: ; %entry
2019; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2020; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
2021; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
2022; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2023; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc
2024; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2025; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
2026; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2027; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
2028entry:
  ; %obj0 (40 bytes) is only written; %obj1 is the object that gets read.
2029  %obj0 = alloca [10 x i32], align 4, addrspace(5)
2030  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
2031  %reg.bc = bitcast i32 %reg to <2 x i16>
2032  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile, so the store to %obj0 must be emitted (0x7b in the checks).
2033  store volatile i32 123, i32 addrspace(5)* %bc
  ; Element 4055 of the i8 array, i.e. 4055 bytes into %obj1.
2034  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
2035  %load = load volatile i8, i8 addrspace(5)* %gep
  ; Sign-extend the byte to the 16-bit element width before inserting it.
2036  %load.ext = sext i8 %load to i16
  ; Replace element 0 only; element 1 still comes from %reg.
2037  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
2038  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
2039  ret void
2040}
2041
; Loads an i8 from private memory (addrspace(5)), zero-extends it to i16 and
; inserts it into element 0 of the <2 x i16> obtained by bitcasting %reg; the
; resulting vector is stored to an undef global pointer.
;
; Expected code, per the assertions below:
;  * both gfx900 runs (MUBUF and flat-scratch) load straight into v0 with a
;    *_load_ubyte_d16 instruction and need no separate merge instruction;
;  * the gfx906 run (which enables sram-ecc) loads into v1 with a plain
;    buffer_load_ubyte and merges with v_bfi_b32;
;  * the fiji run loads into v1 and merges with v_lshrrev_b32 + v_perm_b32.
;
; %obj0 takes 40 bytes and is addressed at s32 with no immediate offset, so
; byte 4055 of %obj1 is expected at s32 offset:4095.
; NOTE(review): 4095 is presumably the largest encodable immediate offset,
; which is what the "_to_offset" suffix refers to - confirm against the ISA.
2042define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
2043; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
2044; GFX900-MUBUF:       ; %bb.0: ; %entry
2045; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2046; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
2047; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2048; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2049; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc
2050; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2051; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
2052; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2053; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
2054;
2055; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
2056; GFX906:       ; %bb.0: ; %entry
2057; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2058; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
2059; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2060; GFX906-NEXT:    s_waitcnt vmcnt(0)
2061; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
2062; GFX906-NEXT:    s_waitcnt vmcnt(0)
2063; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
2064; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
2065; GFX906-NEXT:    global_store_dword v[0:1], v0, off
2066; GFX906-NEXT:    s_waitcnt vmcnt(0)
2067; GFX906-NEXT:    s_setpc_b64 s[30:31]
2068;
2069; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
2070; GFX803:       ; %bb.0: ; %entry
2071; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2072; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
2073; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2074; GFX803-NEXT:    s_waitcnt vmcnt(0)
2075; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
2076; GFX803-NEXT:    s_waitcnt vmcnt(0)
2077; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2078; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
2079; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
2080; GFX803-NEXT:    flat_store_dword v[0:1], v0
2081; GFX803-NEXT:    s_waitcnt vmcnt(0)
2082; GFX803-NEXT:    s_setpc_b64 s[30:31]
2083;
2084; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
2085; GFX900-FLATSCR:       ; %bb.0: ; %entry
2086; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2087; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
2088; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
2089; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2090; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc
2091; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2092; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
2093; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2094; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
2095entry:
  ; Two private stack objects: 40 bytes for %obj0, then 4096 bytes for %obj1.
2096  %obj0 = alloca [10 x i32], align 4, addrspace(5)
2097  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
2098  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Volatile store of 123 (0x7b) to the first dword of %obj0.
2099  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
2100  store volatile i32 123, i32 addrspace(5)* %bc
  ; Byte 4055 of %obj1; the expected instructions address it at offset 4095.
2101  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
2102  %load = load volatile i8, i8 addrspace(5)* %gep
  ; Zero-extend and replace element 0, leaving element 1 of %reg.bc untouched.
2103  %load.ext = zext i8 %load to i16
2104  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
2105  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
2106  ret void
2107}
2108
; Loads an i8 from private memory (addrspace(5)), sign-extends it to i16,
; bitcasts that to half and inserts it into element 0 of the <2 x half>
; obtained by bitcasting %reg; the resulting vector is stored to an undef
; global pointer.
;
; Expected code, per the assertions below:
;  * both gfx900 runs (MUBUF and flat-scratch) load straight into v0 with a
;    *_load_sbyte_d16 instruction and need no separate merge instruction;
;  * the gfx906 run (which enables sram-ecc) loads into v1 with a plain
;    buffer_load_sbyte and merges with v_lshrrev_b32, v_and_b32 and
;    v_lshl_or_b32;
;  * the fiji run loads into v1, masks v0 with 0xffff0000 and merges with an
;    SDWA v_or_b32 that selects WORD_0 of the loaded value.
;
; %obj0 takes 40 bytes and is addressed at s32 with no immediate offset, so
; byte 4055 of %obj1 is expected at s32 offset:4095.
; NOTE(review): 4095 is presumably the largest encodable immediate offset,
; which is what the "_to_offset" suffix refers to - confirm against the ISA.
2109define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
2110; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
2111; GFX900-MUBUF:       ; %bb.0: ; %entry
2112; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2113; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
2114; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2115; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2116; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc
2117; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2118; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
2119; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2120; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
2121;
2122; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
2123; GFX906:       ; %bb.0: ; %entry
2124; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2125; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
2126; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2127; GFX906-NEXT:    s_waitcnt vmcnt(0)
2128; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
2129; GFX906-NEXT:    s_waitcnt vmcnt(0)
2130; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2131; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2132; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
2133; GFX906-NEXT:    global_store_dword v[0:1], v0, off
2134; GFX906-NEXT:    s_waitcnt vmcnt(0)
2135; GFX906-NEXT:    s_setpc_b64 s[30:31]
2136;
2137; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
2138; GFX803:       ; %bb.0: ; %entry
2139; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2140; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
2141; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2142; GFX803-NEXT:    s_waitcnt vmcnt(0)
2143; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
2144; GFX803-NEXT:    s_waitcnt vmcnt(0)
2145; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2146; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2147; GFX803-NEXT:    flat_store_dword v[0:1], v0
2148; GFX803-NEXT:    s_waitcnt vmcnt(0)
2149; GFX803-NEXT:    s_setpc_b64 s[30:31]
2150;
2151; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
2152; GFX900-FLATSCR:       ; %bb.0: ; %entry
2153; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2154; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
2155; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
2156; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2157; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc
2158; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2159; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
2160; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2161; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
2162entry:
  ; Two private stack objects: 40 bytes for %obj0, then 4096 bytes for %obj1.
2163  %obj0 = alloca [10 x i32], align 4, addrspace(5)
2164  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
2165  %reg.bc = bitcast i32 %reg to <2 x half>
  ; Volatile store of 123 (0x7b) to the first dword of %obj0.
2166  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
2167  store volatile i32 123, i32 addrspace(5)* %bc
  ; Byte 4055 of %obj1; the expected instructions address it at offset 4095.
2168  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
2169  %load = load volatile i8, i8 addrspace(5)* %gep
  ; Sign-extend, reinterpret the 16 bits as half, and replace element 0 only.
2170  %load.ext = sext i8 %load to i16
2171  %bitcast = bitcast i16 %load.ext to half
2172  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
2173  store <2 x half> %build1, <2 x half> addrspace(1)* undef
2174  ret void
2175}
2176
; Loads an i8 from private memory (addrspace(5)), zero-extends it to i16,
; bitcasts that to half and inserts it into element 0 of the <2 x half>
; obtained by bitcasting %reg; the resulting vector is stored to an undef
; global pointer.
;
; Expected code, per the assertions below:
;  * both gfx900 runs (MUBUF and flat-scratch) load straight into v0 with a
;    *_load_ubyte_d16 instruction and need no separate merge instruction;
;  * the gfx906 run (which enables sram-ecc) loads into v1 with a plain
;    buffer_load_ubyte and merges with v_lshrrev_b32, v_and_b32 and
;    v_lshl_or_b32;
;  * the fiji run loads into v1 and merges with v_lshrrev_b32 + v_perm_b32.
;
; %obj0 takes 40 bytes and is addressed at s32 with no immediate offset, so
; byte 4055 of %obj1 is expected at s32 offset:4095.
; NOTE(review): 4095 is presumably the largest encodable immediate offset,
; which is what the "_to_offset" suffix refers to - confirm against the ISA.
2177define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
2178; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
2179; GFX900-MUBUF:       ; %bb.0: ; %entry
2180; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2181; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
2182; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2183; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2184; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc
2185; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2186; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
2187; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2188; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
2189;
2190; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
2191; GFX906:       ; %bb.0: ; %entry
2192; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2193; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
2194; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2195; GFX906-NEXT:    s_waitcnt vmcnt(0)
2196; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
2197; GFX906-NEXT:    s_waitcnt vmcnt(0)
2198; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2199; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2200; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
2201; GFX906-NEXT:    global_store_dword v[0:1], v0, off
2202; GFX906-NEXT:    s_waitcnt vmcnt(0)
2203; GFX906-NEXT:    s_setpc_b64 s[30:31]
2204;
2205; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
2206; GFX803:       ; %bb.0: ; %entry
2207; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2208; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
2209; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
2210; GFX803-NEXT:    s_waitcnt vmcnt(0)
2211; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
2212; GFX803-NEXT:    s_waitcnt vmcnt(0)
2213; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2214; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
2215; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
2216; GFX803-NEXT:    flat_store_dword v[0:1], v0
2217; GFX803-NEXT:    s_waitcnt vmcnt(0)
2218; GFX803-NEXT:    s_setpc_b64 s[30:31]
2219;
2220; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
2221; GFX900-FLATSCR:       ; %bb.0: ; %entry
2222; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2223; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
2224; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
2225; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2226; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc
2227; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2228; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
2229; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2230; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
2231entry:
  ; Two private stack objects: 40 bytes for %obj0, then 4096 bytes for %obj1.
2232  %obj0 = alloca [10 x i32], align 4, addrspace(5)
2233  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
2234  %reg.bc = bitcast i32 %reg to <2 x half>
  ; Volatile store of 123 (0x7b) to the first dword of %obj0.
2235  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
2236  store volatile i32 123, i32 addrspace(5)* %bc
  ; Byte 4055 of %obj1; the expected instructions address it at offset 4095.
2237  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
2238  %load = load volatile i8, i8 addrspace(5)* %gep
  ; Zero-extend, reinterpret the 16 bits as half, and replace element 0 only.
2239  %load.ext = zext i8 %load to i16
2240  %bitcast = bitcast i16 %load.ext to half
2241  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
2242  store <2 x half> %build1, <2 x half> addrspace(1)* undef
2243  ret void
2244}
2245
; Attribute group #0, referenced by the test functions in this file: nounwind.
2246attributes #0 = { nounwind }
2247