; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f32_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i32_to_f32_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_sitofp_i32_to_f32_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i32_to_f32_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f32_lshr7_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.7 = lshr i32 %arg0, 7
  %masked = and i32 %lshr.7, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f32_lshr8_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; GFX10-NEXT:    global_store_dword v[0:1], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  store i32 %lshr.8, i32 addrspace(1)* undef
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f32_lshr16_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i8_to_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i8_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to float
  ret float %cvt
}

define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v2i8_to_v2f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT:    v_mov_b32_e32 v0, v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v2i8_to_v2f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i16 %arg0 to <2 x i8>
  %cvt = uitofp <2 x i8> %val to <2 x float>
  ret <2 x float> %cvt
}

define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v3i8_to_v3f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v0, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i8_to_v3f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i32 %arg0 to i24
  %val = bitcast i24 %trunc to <3 x i8>
  %cvt = uitofp <3 x i8> %val to <3 x float>
  ret <3 x float> %cvt
}

define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v4i8_to_v4f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GCN-NEXT:    v_mov_b32_e32 v0, v4
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v4i8_to_v4f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg0 to <4 x i8>
  %cvt = uitofp <4 x i8> %val to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GCN-NEXT:    v_mov_b32_e32 v0, v4
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_unpack_i32_to_v4f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %mask.arg0 = and i32 %arg0, 255
  %cvt0 = uitofp i32 %mask.arg0 to float

  %lshr.8 = lshr i32 %arg0, 8
  %mask.lshr.8 = and i32 %lshr.8, 255
  %cvt1 = uitofp i32 %mask.lshr.8 to float

  %lshr.16 = lshr i32 %arg0, 16
  %mask.lshr.16 = and i32 %lshr.16, 255
  %cvt2 = uitofp i32 %mask.lshr.16 to float

  %lshr.24 = lshr i32 %arg0, 24
  %mask.lshr.24 = and i32 %lshr.24, 255
  %cvt3 = uitofp i32 %mask.lshr.24 to float

  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
  ret <4 x float> %ins.3
}

define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i32_to_f16_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i32_to_f16_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr24_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr24_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i8_to_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to half
  ret half %cvt
}

define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i32_to_f64_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 8
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f64_lshr16_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i8_to_f64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX10-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to double
  ret double %cvt
}

define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: load_i8_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %cvt = uitofp i8 %load to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v2i8_to_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v2i8_to_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: load_v2i8_to_v2f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
  %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
  %cvt = uitofp <2 x i8> %load to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v3i8_to_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v3i8_to_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: load_v3i8_to_v3f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
  %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
  %cvt = uitofp <3 x i8> %load to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: load_v4i8_to_v4f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; This should not add instructions to shift the component into the
; correct position in the word.

; FIXME: Packing bytes
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v4, v[4:5]
; VI-NEXT:    flat_load_ubyte v5, v[6:7]
; VI-NEXT:    flat_load_ubyte v6, v[2:3]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v5
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v6
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v6, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(3)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v1, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; FIXME: Need to handle the non-uniform case for the function below (load without gep).
; Instructions are still emitted to repack the bytes for the add use.
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_movk_i32 s0, 0xff
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
; SI-NEXT:    v_and_b32_e32 v7, 0xff00, v4
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, s0, v4
; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
; SI-NEXT:    v_or_b32_e32 v0, v7, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
; SI-NEXT:    v_and_b32_e32 v2, s0, v2
; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    v_mov_b32_e32 v5, 9
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_movk_i32 s0, 0x900
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
; VI-NEXT:    v_add_u16_e32 v8, 9, v4
; VI-NEXT:    v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
; VI-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u16_e32 v0, s0, v0
; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX10-NEXT:    s_movk_i32 s0, 0x900
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v0
; GFX10-NEXT:    v_add_nc_u16 v4, v0, 9
; GFX10-NEXT:    v_add_nc_u16 v2, v2, 9
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT:    v_add_nc_u16 v1, v1, s0
; GFX10-NEXT:    v_add_nc_u16 v5, v2, s0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT:    global_store_dword v4, v5, s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
  store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
  ret void
}

; Make sure this doesn't crash.
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v7i8_to_v7f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
; SI-NEXT:    buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(6)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT:    s_waitcnt vmcnt(5)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v3
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v4
; SI-NEXT:    v_or_b32_e32 v3, v9, v6
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v8
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:24
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v7i8_to_v7f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v8, vcc, 5, v0
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v10, v[4:5]
; VI-NEXT:    flat_load_ubyte v11, v[6:7]
; VI-NEXT:    flat_load_ubyte v8, v[8:9]
; VI-NEXT:    v_add_u32_e32 v4, vcc, 6, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v6, v[6:7]
; VI-NEXT:    flat_load_ubyte v4, v[4:5]
; VI-NEXT:    flat_load_ubyte v2, v[2:3]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(4)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v6
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v11
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: load_v7i8_to_v7f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x5
; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:2
; GFX10-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
; GFX10-NEXT:    global_load_short_d16 v2, v0, s[2:3] offset:4
; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
; GFX10-NEXT:    global_load_ubyte v7, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v0, v3, 8, v1
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v1, v5
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v2
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v2
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
  %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <7 x i8> %load to <7 x float>
  store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v8i8_to_v8f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v7
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v7, v8
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v8i8_to_v8f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[7:8], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v7
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v7, v8
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: load_v8i8_to_v8f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v10, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v9
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v9
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v8
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v8
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
  %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
  %cvt = uitofp <8 x i8> %load to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %add = add i32 %load, 2
  %inreg = and i32 %add, 255
  %cvt = uitofp i32 %inreg to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_hi1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %inreg = and i32 %load, 65280
  %shr = lshr i32 %inreg, 8
  %cvt = uitofp i32 %shr to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; We don't match these cases because of the zext, but instcombine removes
; them, so it shouldn't really matter.
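; For reference, a minimal sketch of the canonicalization mentioned above
; (assuming the usual instcombine fold of an int-to-fp conversion fed by a
; zext); instcombine is expected to rewrite, e.g.:
;   %ext = zext i8 %load to i32
;   %cvt = uitofp i32 %ext to float
; into a direct conversion from the narrow type:
;   %cvt = uitofp i8 %load to float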
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: i8_zext_i32_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %ext = zext i8 %load to i32
  %cvt = uitofp i32 %ext to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v4, v[4:5]
; VI-NEXT:    flat_load_ubyte v5, v[6:7]
; VI-NEXT:    flat_load_ubyte v2, v[2:3]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v1
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v5
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v5, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
; GFX10-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:1
; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 8, v2
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v1, v3
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; GFX10-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %ext = zext <4 x i8> %load to <4 x i32>
  %cvt = uitofp <4 x i32> %ext to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: extract_byte0_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %and = and i32 %val, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: extract_byte1_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 8
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte2_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte2_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: extract_byte2_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 16
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte3_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte3_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: extract_byte3_to_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 24
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s6
; SI-NEXT:    s_mov_b32 s1, s7
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
; SI-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s0, s6
; VI-NEXT:    s_mov_b32 s1, s7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
; VI-NEXT:    v_add_f32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: cvt_ubyte0_or_multiuse:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX10-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-NEXT:    s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep
  %or = or i32 %load, -2147483647
  %and = and i32 %or, 255
  %uitofp = uitofp i32 %and to float
  %cast = bitcast i32 %or to float
  %add = fadd float %cast, %uitofp
  store float %add, float addrspace(1)* %out
  ret void
}