; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

; GlobalISel tests that byte-extract patterns (and-mask, lshr, bitcast of
; packed bytes) feeding uitofp/sitofp select the v_cvt_f32_ubyte[0-3]
; instructions, using the SDWA byte-select forms on the tonga target.

; uitofp of (x & 255) selects a ubyte0 conversion on both targets.
define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f32_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f32_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

; sitofp of a masked byte is non-negative, so the same unsigned byte
; conversion is selected.
define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f32_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f32_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to float
  ret float %cvt
}

; A shift by 7 is not byte aligned, so the shift remains a separate
; instruction before the byte conversion.
define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr7_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 7, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr7_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 7, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.7 = lshr i32 %arg0, 7
  %masked = and i32 %lshr.7, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

; The shifted value has a second use (the store), so the shift must still
; be emitted for that use.
define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT: flat_store_dword v[0:1], v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  store i32 %lshr.8, i32 addrspace(1)* undef
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xff
; VI-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

; Extracting the top byte needs no mask at all; a single ubyte3 convert
; suffices on both targets.
define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to float
  ret float %cvt
}

define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
; SI-LABEL: v_uitofp_v2i8_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT: s_movk_i32 s4, 0xff
; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_and_b32_e32 v1, s4, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v2i8_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_setpc_b64 s[30:31]
  %val = bitcast i16 %arg0 to <2 x i8>
  %cvt = uitofp <2 x i8> %val to <2 x float>
  ret <2 x float> %cvt
}

define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v3i8_to_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT: s_movk_i32 s4, 0xff
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_and_b32_e32 v1, s4, v1
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v3i8_to_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_movk_i32 s4, 0xff
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
  %trunc = trunc i32 %arg0 to i24
  %val = bitcast i24 %trunc to <3 x i8>
  %cvt = uitofp <3 x i8> %val to <3 x float>
  ret <3 x float> %cvt
}

define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v4i8_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT: s_movk_i32 s4, 0xff
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_and_b32_e32 v1, s4, v1
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_and_b32_e32 v3, s4, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v4i8_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_movk_i32 s4, 0xff
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_and_b32_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v0, v4
; VI-NEXT: s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg0 to <4 x i8>
  %cvt = uitofp <4 x i8> %val to <4 x float>
  ret <4 x float> %cvt
}

; Same as the v4i8 bitcast case, but written as four explicit
; shift/mask/convert/insertelement sequences.
define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_movk_i32 s4, 0xff
; SI-NEXT: v_and_b32_e32 v1, s4, v0
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s4, v1
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT: v_mov_b32_e32 v0, v4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_movk_i32 s4, 0xff
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v0, v4
; VI-NEXT: s_setpc_b64 s[30:31]
  %mask.arg0 = and i32 %arg0, 255
  %cvt0 = uitofp i32 %mask.arg0 to float

  %lshr.8 = lshr i32 %arg0, 8
  %mask.lshr.8 = and i32 %lshr.8, 255
  %cvt1 = uitofp i32 %mask.lshr.8 to float

  %lshr.16 = lshr i32 %arg0, 16
  %mask.lshr.16 = and i32 %lshr.16, 255
  %cvt2 = uitofp i32 %mask.lshr.16 to float

  %lshr.24 = lshr i32 %arg0, 24
  %mask.lshr.24 = and i32 %lshr.24, 255
  %cvt3 = uitofp i32 %mask.lshr.24 to float

  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
  ret <4 x float> %ins.3
}

; f16 destinations: the byte conversion goes through f32 and is then
; rounded with a separate f16 conversion.
define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f16_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f16_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f16_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f16_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xff
; VI-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to half
  ret half %cvt
}

; f64 destinations: there is no byte-to-f64 conversion, so a mask plus
; v_cvt_f64_u32 is expected instead of a ubyte convert.
define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f64_lshr16_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f64_lshr16_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xff
; VI-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i8_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT: s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to double
  ret double %cvt
}

; Kernel variants: the byte comes from memory rather than an argument.
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %cvt = uitofp i8 %load to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; FIXME:
; define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
;   %cvt = uitofp <2 x i8> %load to <2 x float>
;   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
;   ret void
; }

; FIXME:
; define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
;   %cvt = uitofp <3 x i8> %load to <3 x float>
;   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
;   ret void
; }

; define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
;   %cvt = uitofp <4 x i8> %load to <4 x float>
;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
;   ret void
; }

; This should not be adding instructions to shift into the correct
; position in the word for the component.

; FIXME: Packing bytes
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_movk_i32 s6, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v1, s6, v2
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v2, s6, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v3, s6, v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, s6, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: flat_load_ubyte v1, v[2:3]
; VI-NEXT: flat_load_ubyte v2, v[4:5]
; VI-NEXT: flat_load_ubyte v3, v[6:7]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Instructions still emitted to repack bytes for add use.
; define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
;   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
;   %cvt = uitofp <4 x i8> %load to <4 x float>
;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
;   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
;   store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
;   ret void
; }

; Make sure this doesn't crash.
; FIXME:
; define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
;   %cvt = uitofp <7 x i8> %load to <7 x float>
;   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
;   ret void
; }

; FIXME
; define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
;   %cvt = uitofp <8 x i8> %load to <8 x float>
;   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
;   ret void
; }

; The masked add result still fits in the low byte, so the ubyte0 convert
; applies directly to the add result.
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %add = add i32 %load, 2
  %inreg = and i32 %add, 255
  %cvt = uitofp i32 %inreg to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; Mask-then-shift of byte 1: selected as a 0xff00 mask plus a ubyte1
; convert instead of shifting down to byte 0 first.
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_hi1_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff00, v0
; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xff00, v0
; VI-NEXT: v_cvt_f32_ubyte1_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %inreg = and i32 %load, 65280
  %shr = lshr i32 %inreg, 8
  %cvt = uitofp i32 %shr to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_i32_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %ext = zext i8 %load to i32
  %cvt = uitofp i32 %ext to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; Vector zext-then-convert from unaligned byte loads; each loaded byte is
; converted with a ubyte0 convert (SDWA on the tonga target).
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_movk_i32 s6, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v1, s6, v2
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v2, s6, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v3, s6, v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, s6, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: flat_load_ubyte v1, v[2:3]
; VI-NEXT: flat_load_ubyte v2, v[4:5]
; VI-NEXT: flat_load_ubyte v3, v[6:7]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %ext = zext <4 x i8> %load to <4 x i32>
  %cvt = uitofp <4 x i32> %ext to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:
v_ashrrev_i32_e32 v1, 31, v0 926; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 927; VI-NEXT: s_waitcnt lgkmcnt(0) 928; VI-NEXT: v_mov_b32_e32 v3, s1 929; VI-NEXT: v_mov_b32_e32 v2, s0 930; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 931; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 932; VI-NEXT: flat_load_dword v0, v[0:1] 933; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 934; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 935; VI-NEXT: v_mov_b32_e32 v0, s2 936; VI-NEXT: v_mov_b32_e32 v1, s3 937; VI-NEXT: flat_store_dword v[0:1], v2 938; VI-NEXT: s_endpgm 939 %tid = call i32 @llvm.amdgcn.workitem.id.x() 940 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 941 %val = load i32, i32 addrspace(1)* %gep 942 %and = and i32 %val, 255 943 %cvt = uitofp i32 %and to float 944 store float %cvt, float addrspace(1)* %out 945 ret void 946} 947 948define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { 949; SI-LABEL: extract_byte1_to_f32: 950; SI: ; %bb.0: 951; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 952; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 953; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 954; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 955; SI-NEXT: s_mov_b32 s2, 0 956; SI-NEXT: s_mov_b32 s3, 0xf000 957; SI-NEXT: s_waitcnt lgkmcnt(0) 958; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 959; SI-NEXT: s_mov_b32 s2, -1 960; SI-NEXT: s_mov_b64 s[6:7], s[2:3] 961; SI-NEXT: s_waitcnt vmcnt(0) 962; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 963; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 964; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 965; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 966; SI-NEXT: s_endpgm 967; 968; VI-LABEL: extract_byte1_to_f32: 969; VI: ; %bb.0: 970; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 971; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 972; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 973; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 974; VI-NEXT: s_waitcnt lgkmcnt(0) 975; 
VI-NEXT: v_mov_b32_e32 v3, s1 976; VI-NEXT: v_mov_b32_e32 v2, s0 977; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 978; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 979; VI-NEXT: flat_load_dword v0, v[0:1] 980; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 981; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 982; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 983; VI-NEXT: v_mov_b32_e32 v0, s2 984; VI-NEXT: v_mov_b32_e32 v1, s3 985; VI-NEXT: flat_store_dword v[0:1], v2 986; VI-NEXT: s_endpgm 987 %tid = call i32 @llvm.amdgcn.workitem.id.x() 988 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 989 %val = load i32, i32 addrspace(1)* %gep 990 %srl = lshr i32 %val, 8 991 %and = and i32 %srl, 255 992 %cvt = uitofp i32 %and to float 993 store float %cvt, float addrspace(1)* %out 994 ret void 995} 996 997define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { 998; SI-LABEL: extract_byte2_to_f32: 999; SI: ; %bb.0: 1000; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1001; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1002; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1003; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 1004; SI-NEXT: s_mov_b32 s2, 0 1005; SI-NEXT: s_mov_b32 s3, 0xf000 1006; SI-NEXT: s_waitcnt lgkmcnt(0) 1007; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 1008; SI-NEXT: s_mov_b32 s2, -1 1009; SI-NEXT: s_mov_b64 s[6:7], s[2:3] 1010; SI-NEXT: s_waitcnt vmcnt(0) 1011; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1012; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1013; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1014; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1015; SI-NEXT: s_endpgm 1016; 1017; VI-LABEL: extract_byte2_to_f32: 1018; VI: ; %bb.0: 1019; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1020; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1021; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1022; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1023; VI-NEXT: s_waitcnt lgkmcnt(0) 1024; VI-NEXT: 
v_mov_b32_e32 v3, s1 1025; VI-NEXT: v_mov_b32_e32 v2, s0 1026; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 1027; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1028; VI-NEXT: flat_load_dword v0, v[0:1] 1029; VI-NEXT: v_mov_b32_e32 v1, 0xff 1030; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1031; VI-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1032; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 1033; VI-NEXT: v_mov_b32_e32 v0, s2 1034; VI-NEXT: v_mov_b32_e32 v1, s3 1035; VI-NEXT: flat_store_dword v[0:1], v2 1036; VI-NEXT: s_endpgm 1037 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1038 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 1039 %val = load i32, i32 addrspace(1)* %gep 1040 %srl = lshr i32 %val, 16 1041 %and = and i32 %srl, 255 1042 %cvt = uitofp i32 %and to float 1043 store float %cvt, float addrspace(1)* %out 1044 ret void 1045} 1046 1047define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { 1048; SI-LABEL: extract_byte3_to_f32: 1049; SI: ; %bb.0: 1050; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1051; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1052; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1053; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 1054; SI-NEXT: s_mov_b32 s2, 0 1055; SI-NEXT: s_mov_b32 s3, 0xf000 1056; SI-NEXT: s_waitcnt lgkmcnt(0) 1057; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 1058; SI-NEXT: s_mov_b32 s2, -1 1059; SI-NEXT: s_mov_b64 s[6:7], s[2:3] 1060; SI-NEXT: s_waitcnt vmcnt(0) 1061; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 1062; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1063; SI-NEXT: s_endpgm 1064; 1065; VI-LABEL: extract_byte3_to_f32: 1066; VI: ; %bb.0: 1067; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1068; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1069; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1070; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1071; VI-NEXT: s_waitcnt lgkmcnt(0) 1072; VI-NEXT: v_mov_b32_e32 v3, s1 1073; 
VI-NEXT: v_mov_b32_e32 v2, s0 1074; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 1075; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1076; VI-NEXT: flat_load_dword v0, v[0:1] 1077; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1078; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 1079; VI-NEXT: v_mov_b32_e32 v0, s2 1080; VI-NEXT: v_mov_b32_e32 v1, s3 1081; VI-NEXT: flat_store_dword v[0:1], v2 1082; VI-NEXT: s_endpgm 1083 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1084 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 1085 %val = load i32, i32 addrspace(1)* %gep 1086 %srl = lshr i32 %val, 24 1087 %and = and i32 %srl, 255 1088 %cvt = uitofp i32 %and to float 1089 store float %cvt, float addrspace(1)* %out 1090 ret void 1091} 1092 1093define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) { 1094; SI-LABEL: cvt_ubyte0_or_multiuse: 1095; SI: ; %bb.0: ; %bb 1096; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1097; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1098; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 1099; SI-NEXT: s_mov_b32 s6, 0 1100; SI-NEXT: s_mov_b32 s7, 0xf000 1101; SI-NEXT: s_waitcnt lgkmcnt(0) 1102; SI-NEXT: s_mov_b64 s[4:5], s[0:1] 1103; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1104; SI-NEXT: s_mov_b32 s6, -1 1105; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1106; SI-NEXT: s_waitcnt vmcnt(0) 1107; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 1108; SI-NEXT: v_and_b32_e32 v1, 0xff, v0 1109; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1110; SI-NEXT: v_add_f32_e32 v0, v0, v1 1111; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1112; SI-NEXT: s_endpgm 1113; 1114; VI-LABEL: cvt_ubyte0_or_multiuse: 1115; VI: ; %bb.0: ; %bb 1116; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1117; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1118; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1119; VI-NEXT: s_waitcnt lgkmcnt(0) 1120; VI-NEXT: v_mov_b32_e32 v3, s1 1121; VI-NEXT: v_mov_b32_e32 v2, s0 1122; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 1123; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, 
v1, vcc 1124; VI-NEXT: flat_load_dword v0, v[0:1] 1125; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1126; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 1127; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 1128; VI-NEXT: v_add_f32_e32 v2, v0, v1 1129; VI-NEXT: v_mov_b32_e32 v0, s2 1130; VI-NEXT: v_mov_b32_e32 v1, s3 1131; VI-NEXT: flat_store_dword v[0:1], v2 1132; VI-NEXT: s_endpgm 1133bb: 1134 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 1135 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid 1136 %load = load i32, i32 addrspace(1)* %gep 1137 %or = or i32 %load, -2147483647 1138 %and = and i32 %or, 255 1139 %uitofp = uitofp i32 %and to float 1140 %cast = bitcast i32 %or to float 1141 %add = fadd float %cast, %uitofp 1142 store float %add, float addrspace(1)* %out 1143 ret void 1144} 1145 1146define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) { 1147; SI-LABEL: v_test_sitofp_i64_byte_to_f32: 1148; SI: ; %bb.0: 1149; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1150; SI-NEXT: s_movk_i32 s6, 0xff 1151; SI-NEXT: v_and_b32_e32 v2, s6, v0 1152; SI-NEXT: v_add_i32_e32 v2, vcc, 0, v2 1153; SI-NEXT: v_ffbh_u32_e32 v4, v2 1154; SI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc 1155; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v4 1156; SI-NEXT: v_ffbh_u32_e32 v5, v3 1157; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1158; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 1159; SI-NEXT: v_mov_b32_e32 v5, 0xbe 1160; SI-NEXT: v_sub_i32_e32 v6, vcc, v5, v4 1161; SI-NEXT: v_lshl_b64 v[4:5], v[2:3], v4 1162; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] 1163; SI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v5 1164; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 1165; SI-NEXT: v_and_b32_e32 v5, s6, v3 1166; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 1167; SI-NEXT: v_lshlrev_b32_e32 v2, 23, v2 1168; SI-NEXT: s_mov_b32 s4, 0 1169; SI-NEXT: s_movk_i32 s5, 0x80 1170; SI-NEXT: v_or_b32_e32 v2, v2, v3 1171; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] 1172; SI-NEXT: 
v_and_b32_e32 v3, 1, v2 1173; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 1174; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5] 1175; SI-NEXT: v_mov_b32_e32 v0, 0 1176; SI-NEXT: v_cndmask_b32_e64 v3, v3, 1, vcc 1177; SI-NEXT: v_mov_b32_e32 v1, v0 1178; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1179; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1180; SI-NEXT: v_cndmask_b32_e64 v0, v2, -v2, vcc 1181; SI-NEXT: s_setpc_b64 s[30:31] 1182; 1183; VI-LABEL: v_test_sitofp_i64_byte_to_f32: 1184; VI: ; %bb.0: 1185; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1186; VI-NEXT: s_movk_i32 s6, 0xff 1187; VI-NEXT: v_and_b32_e32 v2, s6, v0 1188; VI-NEXT: v_add_u32_e32 v2, vcc, 0, v2 1189; VI-NEXT: v_ffbh_u32_e32 v4, v2 1190; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc 1191; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4 1192; VI-NEXT: v_ffbh_u32_e32 v5, v3 1193; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1194; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 1195; VI-NEXT: v_mov_b32_e32 v5, 0xbe 1196; VI-NEXT: v_sub_u32_e32 v6, vcc, v5, v4 1197; VI-NEXT: v_lshlrev_b64 v[4:5], v4, v[2:3] 1198; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] 1199; VI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v5 1200; VI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 1201; VI-NEXT: v_and_b32_e32 v5, s6, v3 1202; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 1203; VI-NEXT: v_lshlrev_b32_e32 v2, 23, v2 1204; VI-NEXT: s_mov_b32 s4, 0 1205; VI-NEXT: s_movk_i32 s5, 0x80 1206; VI-NEXT: v_or_b32_e32 v2, v2, v3 1207; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] 1208; VI-NEXT: v_and_b32_e32 v3, 1, v2 1209; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 1210; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5] 1211; VI-NEXT: v_mov_b32_e32 v0, 0 1212; VI-NEXT: v_cndmask_b32_e64 v3, v3, 1, vcc 1213; VI-NEXT: v_mov_b32_e32 v1, v0 1214; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v3 1215; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1216; VI-NEXT: v_cndmask_b32_e64 v0, v2, -v2, vcc 1217; VI-NEXT: s_setpc_b64 s[30:31] 1218 %masked = and i64 %arg0, 255 1219 %itofp = sitofp i64 %masked to float 
1220 ret float %itofp 1221} 1222 1223define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) { 1224; SI-LABEL: v_test_uitofp_i64_byte_to_f32: 1225; SI: ; %bb.0: 1226; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1227; SI-NEXT: s_movk_i32 s4, 0xff 1228; SI-NEXT: v_and_b32_e32 v0, s4, v0 1229; SI-NEXT: v_ffbh_u32_e32 v2, v0 1230; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 1231; SI-NEXT: v_ffbh_u32_e32 v3, 0 1232; SI-NEXT: v_cmp_eq_u32_e64 vcc, 0, 0 1233; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 1234; SI-NEXT: v_mov_b32_e32 v3, 0xbe 1235; SI-NEXT: v_mov_b32_e32 v1, 0 1236; SI-NEXT: v_sub_i32_e32 v4, vcc, v3, v2 1237; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 1238; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1239; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3 1240; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc 1241; SI-NEXT: v_and_b32_e32 v3, s4, v1 1242; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 1243; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 1244; SI-NEXT: s_mov_b32 s4, 0 1245; SI-NEXT: s_movk_i32 s5, 0x80 1246; SI-NEXT: v_or_b32_e32 v0, v0, v1 1247; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3] 1248; SI-NEXT: v_and_b32_e32 v1, 1, v0 1249; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 1250; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] 1251; SI-NEXT: v_cndmask_b32_e64 v1, v1, 1, vcc 1252; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1253; SI-NEXT: s_setpc_b64 s[30:31] 1254; 1255; VI-LABEL: v_test_uitofp_i64_byte_to_f32: 1256; VI: ; %bb.0: 1257; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1258; VI-NEXT: s_movk_i32 s4, 0xff 1259; VI-NEXT: v_and_b32_e32 v0, s4, v0 1260; VI-NEXT: v_ffbh_u32_e32 v2, v0 1261; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 1262; VI-NEXT: v_ffbh_u32_e32 v3, 0 1263; VI-NEXT: v_cmp_eq_u32_e64 vcc, 0, 0 1264; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 1265; VI-NEXT: v_mov_b32_e32 v3, 0xbe 1266; VI-NEXT: v_mov_b32_e32 v1, 0 1267; VI-NEXT: v_sub_u32_e32 v4, vcc, v3, v2 1268; VI-NEXT: v_lshlrev_b64 v[2:3], v2, v[0:1] 1269; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1270; VI-NEXT: 
v_and_b32_e32 v1, 0x7fffffff, v3 1271; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc 1272; VI-NEXT: v_and_b32_e32 v3, s4, v1 1273; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 1274; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 1275; VI-NEXT: s_mov_b32 s4, 0 1276; VI-NEXT: s_movk_i32 s5, 0x80 1277; VI-NEXT: v_or_b32_e32 v0, v0, v1 1278; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3] 1279; VI-NEXT: v_and_b32_e32 v1, 1, v0 1280; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 1281; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] 1282; VI-NEXT: v_cndmask_b32_e64 v1, v1, 1, vcc 1283; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 1284; VI-NEXT: s_setpc_b64 s[30:31] 1285 %masked = and i64 %arg0, 255 1286 %itofp = uitofp i64 %masked to float 1287 ret float %itofp 1288} 1289 1290define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) { 1291; SI-LABEL: v_test_sitofp_i16_byte_to_f32: 1292; SI: ; %bb.0: 1293; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1294; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1295; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 1296; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1297; SI-NEXT: s_setpc_b64 s[30:31] 1298; 1299; VI-LABEL: v_test_sitofp_i16_byte_to_f32: 1300; VI: ; %bb.0: 1301; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1302; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 1303; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1304; VI-NEXT: s_setpc_b64 s[30:31] 1305 %masked = and i16 %arg0, 255 1306 %itofp = sitofp i16 %masked to float 1307 ret float %itofp 1308} 1309 1310define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) { 1311; SI-LABEL: v_test_uitofp_i16_byte_to_f32: 1312; SI: ; %bb.0: 1313; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1314; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1315; SI-NEXT: v_bfe_u32 v0, v0, 0, 16 1316; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1317; SI-NEXT: s_setpc_b64 s[30:31] 1318; 1319; VI-LABEL: v_test_uitofp_i16_byte_to_f32: 1320; VI: ; %bb.0: 1321; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1322; VI-NEXT: v_and_b32_e32 
v0, 0xff, v0 1323; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1324; VI-NEXT: s_setpc_b64 s[30:31] 1325 %masked = and i16 %arg0, 255 1326 %itofp = uitofp i16 %masked to float 1327 ret float %itofp 1328} 1329