1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
3; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
4
5declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
6declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
7
; uitofp of (x & 255): SI masks then uses v_cvt_f32_ubyte0; VI folds the mask
; into the SDWA byte select on v_cvt_f32_ubyte0.
define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f32_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f32_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
25
; sitofp of (x & 255): the masked value is known non-negative, so the same
; unsigned byte-convert sequence is selected as in the uitofp case above.
define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f32_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f32_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to float
  ret float %cvt
}
43
; Non-byte-aligned shift amount (7): the shift cannot be folded into a byte
; select, so an explicit v_lshrrev is emitted before the byte convert.
define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr7_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr7_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.7 = lshr i32 %arg0, 7
  %masked = and i32 %lshr.7, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
64
; uitofp of ((x >> 8) & 255). NOTE(review): the shift is currently not folded
; into a BYTE_1 select — an explicit v_lshrrev remains on both subtargets.
define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
85
; Same as lshr8 case but the shifted value has a second use (stored to an
; undef global pointer), so the lshr must stay live and cannot be absorbed
; into the convert.
define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  store i32 %lshr.8, i32 addrspace(1)* undef
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
114
; uitofp of ((x >> 16) & 255): VI folds the shift into an SDWA WORD_1 select
; on the AND; SI uses explicit shift + mask before the byte convert.
define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
136
; uitofp of ((x >> 24) & 255): the mask is redundant after a 24-bit shift of
; an i32, so this selects a single v_cvt_f32_ubyte3 on all subtargets.
; Fixed a copy-paste misnomer: the local was named %lshr.16 for a shift by 24;
; renamed to %lshr.24 (IR value names do not affect the generated code).
define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
148
; Direct i8 -> f32 uitofp with an i8 argument (passed in the low bits of a
; VGPR): SI masks explicitly, VI uses the SDWA BYTE_0 select.
define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to float
  ret float %cvt
}
165
; <2 x i8> (bitcast from i16) -> <2 x float>: both bytes are extracted from
; the same register; VI converts each with an SDWA BYTE_0 select.
define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
; SI-LABEL: v_uitofp_v2i8_to_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v2i8_to_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i16 %arg0 to <2 x i8>
  %cvt = uitofp <2 x i8> %val to <2 x float>
  ret <2 x float> %cvt
}
189
; <3 x i8> (via trunc i32 -> i24, bitcast) -> <3 x float>: extracts and
; converts the three low bytes of the i32 argument.
define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v3i8_to_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v3i8_to_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i32 %arg0 to i24
  %val = bitcast i24 %trunc to <3 x i8>
  %cvt = uitofp <3 x i8> %val to <3 x float>
  ret <3 x float> %cvt
}
222
; <4 x i8> (bitcast from i32) -> <4 x float>: all four bytes of one i32
; converted; VI mixes SDWA byte selects with an explicit mask for BYTE_3.
define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v4i8_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_and_b32_e32 v3, s4, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v4i8_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v3, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_and_b32_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_and_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    v_mov_b32_e32 v0, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg0 to <4 x i8>
  %cvt = uitofp <4 x i8> %val to <4 x float>
  ret <4 x float> %cvt
}
259
; Scalar unpack pattern: four separate (shift, mask, uitofp) chains over the
; same i32, then insertelement into a <4 x float>. The byte-3 lane selects
; v_cvt_f32_ubyte3 on both subtargets.
define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v1, s4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    v_mov_b32_e32 v0, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %mask.arg0 = and i32 %arg0, 255
  %cvt0 = uitofp i32 %mask.arg0 to float

  %lshr.8 = lshr i32 %arg0, 8
  %mask.lshr.8 = and i32 %lshr.8, 255
  %cvt1 = uitofp i32 %mask.lshr.8 to float

  %lshr.16 = lshr i32 %arg0, 16
  %mask.lshr.16 = and i32 %lshr.16, 255
  %cvt2 = uitofp i32 %mask.lshr.16 to float

  %lshr.24 = lshr i32 %arg0, 24
  %mask.lshr.24 = and i32 %lshr.24, 255
  %cvt3 = uitofp i32 %mask.lshr.24 to float

  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
  ret <4 x float> %ins.3
}
311
; f16 variant of the mask255 test: converts via f32 (byte convert followed by
; v_cvt_f16_f32) since there is no direct ubyte -> f16 conversion here.
define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}
331
; sitofp-to-f16 of (x & 255): known non-negative after the mask, so the same
; unsigned byte-convert + f16 round sequence is used as in the uitofp case.
define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to half
  ret half %cvt
}
351
; f16 variant of the lshr8 test: explicit shift remains; conversion goes
; through f32 before rounding to f16.
define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}
374
; f16 variant of the lshr16 test: VI folds the shift into an SDWA WORD_1
; select on the AND; result is rounded to f16 via v_cvt_f16_f32.
define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}
398
; f16 variant of the lshr24 test: the mask is redundant after a 24-bit shift,
; so a single v_cvt_f32_ubyte3 feeds the f16 round on all subtargets.
; Fixed a copy-paste misnomer: the local was named %lshr.16 for a shift by 24;
; renamed to %lshr.24 (IR value names do not affect the generated code).
define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}
411
; Direct i8 -> f16 uitofp: byte convert to f32, then round to f16.
define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to half
  ret half %cvt
}
430
; f64 variant of the mask255 test: no byte-convert instruction exists for
; f64, so both subtargets mask and use v_cvt_f64_u32.
define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}
442
; f64 variant of the lshr8 test: explicit shift + mask feed v_cvt_f64_u32.
define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}
456
; f64 variant of the lshr16 test: VI folds the shift into an SDWA WORD_1
; select on the AND before v_cvt_f64_u32; SI shifts and masks explicitly.
define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f64_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f64_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}
478
; f64 variant of the lshr24 test: the mask is redundant after a 24-bit shift
; of an i32, so only the shift remains before v_cvt_f64_u32.
; Fixed a copy-paste misnomer: the local was named %lshr.16 for a shift by 24;
; renamed to %lshr.24 (IR value names do not affect the generated code).
define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}
491
; Direct i8 -> f64 uitofp: mask the byte, then v_cvt_f64_u32.
define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i8_to_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to double
  ret double %cvt
}
502
; Kernel: load an i8 from global memory (indexed by workitem id) and convert
; to f32. NOTE(review): SI still emits a redundant v_and_b32 on the
; zero-extended ubyte load before the byte convert.
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %cvt = uitofp i8 %load to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}
545
546; FIXME:
547; define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
548;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
549;   %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
550;   %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
551;   %cvt = uitofp <2 x i8> %load to <2 x float>
552;   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
553;   ret void
554; }
555
556; FIXME:
557; define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
558;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
559;   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
560;   %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
561;   %cvt = uitofp <3 x i8> %load to <3 x float>
562;   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
563;   ret void
564; }
565
566; define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
567;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
568;   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
569;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
570;   %cvt = uitofp <4 x i8> %load to <4 x float>
571;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
572;   ret void
573; }
574
575; This should not be adding instructions to shift into the correct
576; position in the word for the component.
577
578; FIXME: Packing bytes
; Kernel: unaligned (align 1) <4 x i8> load -> <4 x float>. The load splits
; into four single-byte loads; each byte is converted independently, so no
; repacking shifts should appear (see comment above about packing bytes).
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_movk_i32 s6, 0xff
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_and_b32_e32 v1, s6, v2
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_and_b32_e32 v2, s6, v3
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_and_b32_e32 v3, s6, v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v4, s6, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    flat_load_ubyte v1, v[2:3]
; VI-NEXT:    flat_load_ubyte v2, v[4:5]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}
651
652; FIXME: Need to handle non-uniform case for function below (load without gep).
653; Instructions still emitted to repack bytes for add use.
654; define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
655;   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
656;   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
657;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
658;   %cvt = uitofp <4 x i8> %load to <4 x float>
659;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
660;   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
661;   store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
662;   ret void
663; }
664
665; Make sure this doesn't crash.
666; FIXME:
667; define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
668;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
669;   %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
670;   %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
671;   %cvt = uitofp <7 x i8> %load to <7 x float>
672;   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
673;   ret void
674; }
675
676; FIXME
677; define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
678;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
679;   %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
680;   %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
681;   %cvt = uitofp <8 x i8> %load to <8 x float>
682;   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
683;   ret void
684; }
685
; Kernel: (load + 2) & 255 -> f32. The add must happen before the byte
; extract; VI then folds the mask into an SDWA BYTE_0 select on the convert.
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %add = add i32 %load, 2
  %inreg = and i32 %add, 255
  %cvt = uitofp i32 %inreg to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}
734
; Kernel: ((load & 0xff00) >> 8) -> f32. The mask-then-shift is recognized as
; byte 1, so v_cvt_f32_ubyte1 is used on the masked value instead of shifting.
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_hi1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %inreg = and i32 %load, 65280
  %shr = lshr i32 %inreg, 8
  %cvt = uitofp i32 %shr to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}
782
; We don't fold these cases into a cvt_f32_ubyte because of the intervening
; zext, but instcombine removes the zext anyway, so it shouldn't really matter.
; uitofp of a zext'd i8 load: the ubyte load already zero-extends, so the value
; feeds v_cvt_f32_ubyte0 directly with no masking on either target.
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %ext = zext i8 %load to i32
  %cvt = uitofp i32 %ext to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}
827
; Vector form of the zext case: the align-1 <4 x i8> load is split into four
; separate ubyte loads, each converted independently with v_cvt_f32_ubyte0
; (SDWA BYTE_0 form on VI), then stored as one dwordx4.
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_movk_i32 s6, 0xff
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_and_b32_e32 v1, s6, v2
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_and_b32_e32 v2, s6, v3
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_and_b32_e32 v3, s6, v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v4, s6, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    flat_load_ubyte v1, v[2:3]
; VI-NEXT:    flat_load_ubyte v2, v[4:5]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %ext = zext <4 x i8> %load to <4 x i32>
  %cvt = uitofp <4 x i32> %ext to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}
901
; uitofp(x & 255): SI masks explicitly then converts with v_cvt_f32_ubyte0;
; VI folds the mask into the SDWA src0_sel:BYTE_0 operand select.
define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %and = and i32 %val, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}
947
; uitofp((x >> 8) & 255).
; NOTE(review): both targets shift right by 8 and then convert byte 0
; (SI also re-masks with 0xff) instead of selecting byte 1 directly with
; v_cvt_f32_ubyte1 / src0_sel:BYTE_1 — confirm this is a known GlobalISel
; missed optimization and not intended output.
define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 8
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}
996
; uitofp((x >> 16) & 255).
; NOTE(review): SI shifts then masks and converts byte 0; VI masks via an SDWA
; v_and with src0_sel:WORD_1 and then converts byte 0 — neither uses
; v_cvt_f32_ubyte2 directly. Confirm whether this is a known missed fold.
define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte2_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte2_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 16
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}
1046
; uitofp((x >> 24) & 255): folds completely to v_cvt_f32_ubyte3 on both
; targets — shifting right by 24 already leaves only byte 3, so no mask or
; shift instruction survives.
define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte3_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte3_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 24
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}
1092
; Multi-use source: %or feeds both the byte-0 uitofp and a bitcast-to-float
; add, so the v_or result must stay live alongside the conversion. VI still
; folds the byte-0 extract into the SDWA cvt; SI keeps an explicit 0xff mask.
define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_add_f32_e32 v2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep
  %or = or i32 %load, -2147483647           ; 0x80000001: sets sign bit and bit 0
  %and = and i32 %or, 255
  %uitofp = uitofp i32 %and to float
  %cast = bitcast i32 %or to float          ; second use of %or keeps it live
  %add = fadd float %cast, %uitofp
  store float %add, float addrspace(1)* %out
  ret void
}
1145
; sitofp of (i64 & 255): no cvt_f32_ubyte fold for 64-bit sources — the
; conversion goes through the generic i64-to-f32 expansion (leading-zero
; normalization via v_ffbh_u32, exponent construction, round-to-nearest with
; the sticky/odd bits, and a final sign select).
; NOTE(review): since the masked value is provably in [0, 255], this whole
; sequence could presumably reduce to a single byte convert — confirm whether
; that is a known GlobalISel deficiency this test is pinning down.
define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s6, 0xff
; SI-NEXT:    v_and_b32_e32 v2, s6, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0, v2
; SI-NEXT:    v_ffbh_u32_e32 v4, v2
; SI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
; SI-NEXT:    v_ffbh_u32_e32 v5, v3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, 0xbe
; SI-NEXT:    v_sub_i32_e32 v6, vcc, v5, v4
; SI-NEXT:    v_lshl_b64 v[4:5], v[2:3], v4
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v5
; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
; SI-NEXT:    v_and_b32_e32 v5, s6, v3
; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; SI-NEXT:    s_mov_b32 s4, 0
; SI-NEXT:    s_movk_i32 s5, 0x80
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; SI-NEXT:    v_and_b32_e32 v3, 1, v2
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_cndmask_b32_e64 v3, v3, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, v2, -v2, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s6, 0xff
; VI-NEXT:    v_and_b32_e32 v2, s6, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, 0, v2
; VI-NEXT:    v_ffbh_u32_e32 v4, v2
; VI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v4
; VI-NEXT:    v_ffbh_u32_e32 v5, v3
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, 0xbe
; VI-NEXT:    v_sub_u32_e32 v6, vcc, v5, v4
; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, v[2:3]
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; VI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v5
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
; VI-NEXT:    v_and_b32_e32 v5, s6, v3
; VI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; VI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    s_movk_i32 s5, 0x80
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; VI-NEXT:    v_and_b32_e32 v3, 1, v2
; VI-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_cndmask_b32_e64 v3, v3, 1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, v2, -v2, vcc
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i64 %arg0, 255
  %itofp = sitofp i64 %masked to float
  ret float %itofp
}
1222
; uitofp of (i64 & 255): like the sitofp case above, 64-bit sources take the
; generic i64-to-f32 expansion (ffbh normalization, exponent build, rounding),
; just without the final sign select.
; NOTE(review): the masked value fits in a byte, so this too looks reducible
; to a single byte convert — confirm whether GlobalISel is expected to catch it.
define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_ffbh_u32_e32 v2, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
; SI-NEXT:    v_ffbh_u32_e32 v3, 0
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v3, 0xbe
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_sub_i32_e32 v4, vcc, v3, v2
; SI-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
; SI-NEXT:    v_and_b32_e32 v3, s4, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; SI-NEXT:    s_mov_b32 s4, 0
; SI-NEXT:    s_movk_i32 s5, 0x80
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT:    v_and_b32_e32 v1, 1, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v1, v1, 1, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_and_b32_e32 v0, s4, v0
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
; VI-NEXT:    v_ffbh_u32_e32 v3, 0
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v3, 0xbe
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_sub_u32_e32 v4, vcc, v3, v2
; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
; VI-NEXT:    v_and_b32_e32 v3, s4, v1
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    s_movk_i32 s5, 0x80
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT:    v_and_b32_e32 v1, 1, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v1, v1, 1, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i64 %arg0, 255
  %itofp = uitofp i64 %masked to float
  ret float %itofp
}
1289
; sitofp of (i16 & 255): SI sign-extends the 16-bit value with v_bfe_i32 before
; converting; VI folds the extension into the cvt via the SDWA sext(v0) source
; modifier with src0_sel:WORD_0.
; NOTE(review): after the 0xff mask the value is non-negative, so the
; sign-extension step looks removable — confirm intent.
define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i16 %arg0, 255
  %itofp = sitofp i16 %masked to float
  ret float %itofp
}
1309
; uitofp of (i16 & 255): SI zero-extends the 16-bit value with v_bfe_u32 before
; the convert; VI selects WORD_0 via SDWA on the cvt instead.
; NOTE(review): the 0xff mask already clears bits 8-15, so the extra
; zero-extension step looks redundant — confirm intent.
define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_bfe_u32 v0, v0, 0, 16
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i16 %arg0, 255
  %itofp = uitofp i16 %masked to float
  ret float %itofp
}
1329