1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI 3; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI 4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 5 6declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 7declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone 8 9define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind { 10; GCN-LABEL: v_uitofp_i32_to_f32_mask255: 11; GCN: ; %bb.0: 12; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 14; GCN-NEXT: s_setpc_b64 s[30:31] 15; 16; GFX10-LABEL: v_uitofp_i32_to_f32_mask255: 17; GFX10: ; %bb.0: 18; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 20; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 21; GFX10-NEXT: s_setpc_b64 s[30:31] 22 %masked = and i32 %arg0, 255 23 %cvt = uitofp i32 %masked to float 24 ret float %cvt 25} 26 27define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind { 28; GCN-LABEL: v_sitofp_i32_to_f32_mask255: 29; GCN: ; %bb.0: 30; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 32; GCN-NEXT: s_setpc_b64 s[30:31] 33; 34; GFX10-LABEL: v_sitofp_i32_to_f32_mask255: 35; GFX10: ; %bb.0: 36; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 38; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 39; GFX10-NEXT: s_setpc_b64 s[30:31] 40 %masked = and i32 %arg0, 255 41 %cvt = sitofp i32 %masked to float 42 ret float %cvt 43} 44 45define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind { 46; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255: 47; GCN: ; %bb.0: 48; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 49; GCN-NEXT: v_lshrrev_b32_e32 v0, 7, v0 50; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 51; GCN-NEXT: s_setpc_b64 s[30:31] 52; 53; GFX10-LABEL: v_uitofp_to_f32_lshr7_mask255: 54; GFX10: ; %bb.0: 55; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 56; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 57; GFX10-NEXT: v_lshrrev_b32_e32 v0, 7, v0 58; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 59; GFX10-NEXT: s_setpc_b64 s[30:31] 60 %lshr.7 = lshr i32 %arg0, 7 61 %masked = and i32 %lshr.7, 255 62 %cvt = uitofp i32 %masked to float 63 ret float %cvt 64} 65 66define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind { 67; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255: 68; GCN: ; %bb.0: 69; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 70; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 71; GCN-NEXT: s_setpc_b64 s[30:31] 72; 73; GFX10-LABEL: v_uitofp_to_f32_lshr8_mask255: 74; GFX10: ; %bb.0: 75; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 76; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 77; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 78; GFX10-NEXT: s_setpc_b64 s[30:31] 79 %lshr.8 = lshr i32 %arg0, 8 80 %masked = and i32 %lshr.8, 255 81 %cvt = uitofp i32 %masked to float 82 ret float %cvt 83} 84 85define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind { 86; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: 87; SI: ; %bb.0: 88; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 89; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 90; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 91; SI-NEXT: s_mov_b32 s7, 0xf000 92; SI-NEXT: s_mov_b32 s6, -1 93; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 94; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) 95; SI-NEXT: s_setpc_b64 s[30:31] 96; 97; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: 98; VI: ; %bb.0: 99; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 100; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 101; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 102; VI-NEXT: s_mov_b32 s7, 0xf000 103; VI-NEXT: s_mov_b32 s6, -1 104; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 105; VI-NEXT: s_waitcnt vmcnt(0) 106; VI-NEXT: s_setpc_b64 s[30:31] 107; 108; GFX10-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: 109; GFX10: ; %bb.0: 110; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 111; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 112; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 113; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 114; GFX10-NEXT: global_store_dword v[0:1], v1, off 115; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 116; GFX10-NEXT: s_setpc_b64 s[30:31] 117 %lshr.8 = lshr i32 %arg0, 8 118 store i32 %lshr.8, i32 addrspace(1)* undef 119 %masked = and i32 %lshr.8, 255 120 %cvt = uitofp i32 %masked to float 121 ret float %cvt 122} 123 124define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind { 125; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255: 126; GCN: ; %bb.0: 127; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 128; GCN-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 129; GCN-NEXT: s_setpc_b64 s[30:31] 130; 131; GFX10-LABEL: v_uitofp_to_f32_lshr16_mask255: 132; GFX10: ; %bb.0: 133; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 134; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 135; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 136; GFX10-NEXT: s_setpc_b64 s[30:31] 137 %lshr.16 = lshr i32 %arg0, 16 138 %masked = and i32 %lshr.16, 255 139 %cvt = uitofp i32 %masked to float 140 ret float %cvt 141} 142 143define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind { 144; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255: 145; GCN: ; %bb.0: 146; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 147; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 148; GCN-NEXT: s_setpc_b64 s[30:31] 149; 150; GFX10-LABEL: v_uitofp_to_f32_lshr24_mask255: 151; GFX10: ; %bb.0: 152; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 153; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 154; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 155; GFX10-NEXT: s_setpc_b64 s[30:31] 156 %lshr.16 = lshr i32 %arg0, 24 157 %masked = and i32 %lshr.16, 255 158 %cvt = uitofp i32 %masked to float 159 ret float %cvt 160} 161 162define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind { 163; GCN-LABEL: v_uitofp_i8_to_f32: 164; GCN: ; %bb.0: 165; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 166; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 167; GCN-NEXT: s_setpc_b64 s[30:31] 168; 169; GFX10-LABEL: v_uitofp_i8_to_f32: 170; GFX10: ; %bb.0: 171; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 172; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 173; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 174; GFX10-NEXT: s_setpc_b64 s[30:31] 175 %cvt = uitofp i8 %arg0 to float 176 ret float %cvt 177} 178 179define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind { 180; GCN-LABEL: v_uitofp_v2i8_to_v2f32: 181; GCN: ; %bb.0: 182; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 183; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 184; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 185; GCN-NEXT: v_mov_b32_e32 v0, v2 186; GCN-NEXT: s_setpc_b64 s[30:31] 187; 188; GFX10-LABEL: v_uitofp_v2i8_to_v2f32: 189; GFX10: ; %bb.0: 190; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 191; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 192; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 193; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 194; GFX10-NEXT: 
v_mov_b32_e32 v0, v2 195; GFX10-NEXT: s_setpc_b64 s[30:31] 196 %val = bitcast i16 %arg0 to <2 x i8> 197 %cvt = uitofp <2 x i8> %val to <2 x float> 198 ret <2 x float> %cvt 199} 200 201define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind { 202; GCN-LABEL: v_uitofp_v3i8_to_v3f32: 203; GCN: ; %bb.0: 204; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 205; GCN-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 206; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 207; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 208; GCN-NEXT: v_mov_b32_e32 v0, v3 209; GCN-NEXT: s_setpc_b64 s[30:31] 210; 211; GFX10-LABEL: v_uitofp_v3i8_to_v3f32: 212; GFX10: ; %bb.0: 213; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 214; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 215; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 216; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 217; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 218; GFX10-NEXT: v_mov_b32_e32 v0, v3 219; GFX10-NEXT: s_setpc_b64 s[30:31] 220 %trunc = trunc i32 %arg0 to i24 221 %val = bitcast i24 %trunc to <3 x i8> 222 %cvt = uitofp <3 x i8> %val to <3 x float> 223 ret <3 x float> %cvt 224} 225 226define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind { 227; GCN-LABEL: v_uitofp_v4i8_to_v4f32: 228; GCN: ; %bb.0: 229; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 230; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 231; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 232; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 233; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 234; GCN-NEXT: v_mov_b32_e32 v0, v4 235; GCN-NEXT: s_setpc_b64 s[30:31] 236; 237; GFX10-LABEL: v_uitofp_v4i8_to_v4f32: 238; GFX10: ; %bb.0: 239; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 240; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 241; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 242; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 243; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 244; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 245; GFX10-NEXT: v_mov_b32_e32 v0, v4 246; GFX10-NEXT: s_setpc_b64 s[30:31] 247 %val = bitcast i32 %arg0 to <4 x i8> 248 %cvt = uitofp <4 x i8> %val to <4 x float> 249 ret <4 x float> %cvt 250} 251 252define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind { 253; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32: 254; GCN: ; %bb.0: 255; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 256; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 257; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 258; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 259; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 260; GCN-NEXT: v_mov_b32_e32 v0, v4 261; GCN-NEXT: s_setpc_b64 s[30:31] 262; 263; GFX10-LABEL: v_uitofp_unpack_i32_to_v4f32: 264; GFX10: ; %bb.0: 265; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 266; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 267; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 268; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 269; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 270; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 271; GFX10-NEXT: v_mov_b32_e32 v0, v4 272; GFX10-NEXT: s_setpc_b64 s[30:31] 273 %mask.arg0 = and i32 %arg0, 255 274 %cvt0 = uitofp i32 %mask.arg0 to float 275 276 %lshr.8 = lshr i32 %arg0, 8 277 %mask.lshr.8 = and i32 %lshr.8, 255 278 %cvt1 = uitofp i32 %mask.lshr.8 to float 279 280 %lshr.16 = lshr i32 %arg0, 16 281 %mask.lshr.16 = and i32 %lshr.16, 255 282 %cvt2 = uitofp i32 %mask.lshr.16 to float 283 284 %lshr.24 = lshr i32 %arg0, 24 285 %mask.lshr.24 = and i32 %lshr.24, 255 286 %cvt3 = uitofp i32 %mask.lshr.24 to float 287 288 %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0 289 %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1 290 %ins.2 = 
insertelement <4 x float> %ins.1, float %cvt2, i32 2 291 %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3 292 ret <4 x float> %ins.3 293} 294 295define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind { 296; SI-LABEL: v_uitofp_i32_to_f16_mask255: 297; SI: ; %bb.0: 298; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 299; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 300; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 301; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 302; SI-NEXT: s_setpc_b64 s[30:31] 303; 304; VI-LABEL: v_uitofp_i32_to_f16_mask255: 305; VI: ; %bb.0: 306; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 307; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 308; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 309; VI-NEXT: s_setpc_b64 s[30:31] 310; 311; GFX10-LABEL: v_uitofp_i32_to_f16_mask255: 312; GFX10: ; %bb.0: 313; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 314; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 315; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 316; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 317; GFX10-NEXT: s_setpc_b64 s[30:31] 318 %masked = and i32 %arg0, 255 319 %cvt = uitofp i32 %masked to half 320 ret half %cvt 321} 322 323define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind { 324; SI-LABEL: v_sitofp_i32_to_f16_mask255: 325; SI: ; %bb.0: 326; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 327; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 328; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 329; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 330; SI-NEXT: s_setpc_b64 s[30:31] 331; 332; VI-LABEL: v_sitofp_i32_to_f16_mask255: 333; VI: ; %bb.0: 334; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 335; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 336; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 337; VI-NEXT: s_setpc_b64 s[30:31] 338; 339; GFX10-LABEL: v_sitofp_i32_to_f16_mask255: 340; GFX10: ; %bb.0: 341; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 342; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 343; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 344; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 345; GFX10-NEXT: s_setpc_b64 s[30:31] 346 %masked = and i32 %arg0, 255 347 %cvt = sitofp i32 %masked to half 348 ret half %cvt 349} 350 351define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind { 352; SI-LABEL: v_uitofp_to_f16_lshr8_mask255: 353; SI: ; %bb.0: 354; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 355; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 356; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 357; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 358; SI-NEXT: s_setpc_b64 s[30:31] 359; 360; VI-LABEL: v_uitofp_to_f16_lshr8_mask255: 361; VI: ; %bb.0: 362; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 363; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 364; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 365; VI-NEXT: s_setpc_b64 s[30:31] 366; 367; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255: 368; GFX10: ; %bb.0: 369; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 370; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 371; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 372; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 373; GFX10-NEXT: s_setpc_b64 s[30:31] 374 %lshr.8 = lshr i32 %arg0, 8 375 %masked = and i32 %lshr.8, 255 376 %cvt = uitofp i32 %masked to half 377 ret half %cvt 378} 379 380define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind { 381; SI-LABEL: v_uitofp_to_f16_lshr16_mask255: 382; SI: ; %bb.0: 383; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 384; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 385; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 386; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 387; SI-NEXT: s_setpc_b64 s[30:31] 388; 389; VI-LABEL: v_uitofp_to_f16_lshr16_mask255: 390; VI: ; %bb.0: 391; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 392; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 393; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 394; VI-NEXT: s_setpc_b64 s[30:31] 395; 396; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255: 397; GFX10: ; %bb.0: 398; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 399; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 400; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 401; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 402; GFX10-NEXT: s_setpc_b64 s[30:31] 403 %lshr.16 = lshr i32 %arg0, 16 404 %masked = and i32 %lshr.16, 255 405 %cvt = uitofp i32 %masked to half 406 ret half %cvt 407} 408 409define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind { 410; SI-LABEL: v_uitofp_to_f16_lshr24_mask255: 411; SI: ; %bb.0: 412; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 413; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 414; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 415; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 416; SI-NEXT: s_setpc_b64 s[30:31] 417; 418; VI-LABEL: v_uitofp_to_f16_lshr24_mask255: 419; VI: ; %bb.0: 420; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 421; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 422; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 423; VI-NEXT: s_setpc_b64 s[30:31] 424; 425; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255: 426; GFX10: ; %bb.0: 427; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 428; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 429; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 430; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 431; GFX10-NEXT: s_setpc_b64 s[30:31] 432 %lshr.16 = lshr i32 %arg0, 24 433 %masked = and i32 %lshr.16, 255 434 %cvt = uitofp i32 %masked to half 435 ret half %cvt 436} 437 438define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind { 439; SI-LABEL: v_uitofp_i8_to_f16: 440; SI: ; %bb.0: 441; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 442; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 443; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 444; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 445; SI-NEXT: s_setpc_b64 s[30:31] 446; 447; VI-LABEL: v_uitofp_i8_to_f16: 448; VI: ; %bb.0: 449; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 450; VI-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 451; VI-NEXT: s_setpc_b64 s[30:31] 452; 453; GFX10-LABEL: v_uitofp_i8_to_f16: 454; GFX10: ; %bb.0: 455; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 456; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 457; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 458; GFX10-NEXT: s_setpc_b64 s[30:31] 459 %cvt = uitofp i8 %arg0 to half 460 ret half %cvt 461} 462 463define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind { 464; GCN-LABEL: v_uitofp_i32_to_f64_mask255: 465; GCN: ; %bb.0: 466; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 467; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 468; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 469; GCN-NEXT: s_setpc_b64 s[30:31] 470; 471; GFX10-LABEL: v_uitofp_i32_to_f64_mask255: 472; GFX10: ; %bb.0: 473; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 474; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 475; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 476; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 477; GFX10-NEXT: s_setpc_b64 s[30:31] 478 %masked = and i32 %arg0, 255 479 %cvt = uitofp i32 %masked to double 480 ret double %cvt 481} 482 483define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind { 484; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255: 485; GCN: ; %bb.0: 486; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 487; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8 488; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 489; GCN-NEXT: s_setpc_b64 s[30:31] 490; 491; GFX10-LABEL: 
v_uitofp_to_f64_lshr8_mask255: 492; GFX10: ; %bb.0: 493; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 494; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 495; GFX10-NEXT: v_bfe_u32 v0, v0, 8, 8 496; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 497; GFX10-NEXT: s_setpc_b64 s[30:31] 498 %lshr.8 = lshr i32 %arg0, 8 499 %masked = and i32 %lshr.8, 255 500 %cvt = uitofp i32 %masked to double 501 ret double %cvt 502} 503 504define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind { 505; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255: 506; GCN: ; %bb.0: 507; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 508; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8 509; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 510; GCN-NEXT: s_setpc_b64 s[30:31] 511; 512; GFX10-LABEL: v_uitofp_to_f64_lshr16_mask255: 513; GFX10: ; %bb.0: 514; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 515; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 516; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8 517; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 518; GFX10-NEXT: s_setpc_b64 s[30:31] 519 %lshr.16 = lshr i32 %arg0, 16 520 %masked = and i32 %lshr.16, 255 521 %cvt = uitofp i32 %masked to double 522 ret double %cvt 523} 524 525define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind { 526; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255: 527; GCN: ; %bb.0: 528; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 529; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0 530; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 531; GCN-NEXT: s_setpc_b64 s[30:31] 532; 533; GFX10-LABEL: v_uitofp_to_f64_lshr24_mask255: 534; GFX10: ; %bb.0: 535; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 536; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 537; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 538; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 539; GFX10-NEXT: s_setpc_b64 s[30:31] 540 %lshr.16 = lshr i32 %arg0, 24 541 %masked = and i32 %lshr.16, 255 542 %cvt = uitofp i32 %masked to double 543 ret double %cvt 544} 545 546define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { 547; SI-LABEL: v_uitofp_i8_to_f64: 548; SI: ; %bb.0: 549; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 550; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 551; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 552; SI-NEXT: s_setpc_b64 s[30:31] 553; 554; VI-LABEL: v_uitofp_i8_to_f64: 555; VI: ; %bb.0: 556; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 557; VI-NEXT: v_mov_b32_e32 v1, 0xffff 558; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 559; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 560; VI-NEXT: s_setpc_b64 s[30:31] 561; 562; GFX10-LABEL: v_uitofp_i8_to_f64: 563; GFX10: ; %bb.0: 564; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 565; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 566; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff 567; GFX10-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 568; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 569; GFX10-NEXT: s_setpc_b64 s[30:31] 570 %cvt = uitofp i8 %arg0 to double 571 ret double %cvt 572} 573 574define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { 575; SI-LABEL: load_i8_to_f32: 576; SI: ; %bb.0: 577; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 578; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 579; SI-NEXT: s_mov_b32 s7, 0xf000 580; SI-NEXT: v_mov_b32_e32 v1, 0 581; SI-NEXT: s_mov_b32 s2, 0 582; SI-NEXT: s_mov_b32 s3, s7 583; SI-NEXT: s_waitcnt lgkmcnt(0) 584; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 585; SI-NEXT: s_mov_b32 s6, -1 586; SI-NEXT: 
s_waitcnt vmcnt(0) 587; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 588; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 589; SI-NEXT: s_endpgm 590; 591; VI-LABEL: load_i8_to_f32: 592; VI: ; %bb.0: 593; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 594; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 595; VI-NEXT: s_mov_b32 s7, 0xf000 596; VI-NEXT: s_mov_b32 s6, -1 597; VI-NEXT: s_waitcnt lgkmcnt(0) 598; VI-NEXT: v_mov_b32_e32 v1, s1 599; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 600; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 601; VI-NEXT: flat_load_ubyte v0, v[0:1] 602; VI-NEXT: s_waitcnt vmcnt(0) 603; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 604; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 605; VI-NEXT: s_endpgm 606; 607; GFX10-LABEL: load_i8_to_f32: 608; GFX10: ; %bb.0: 609; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 610; GFX10-NEXT: v_mov_b32_e32 v1, 0 611; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 612; GFX10-NEXT: s_waitcnt lgkmcnt(0) 613; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 614; GFX10-NEXT: s_waitcnt vmcnt(0) 615; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 616; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 617; GFX10-NEXT: s_endpgm 618 %tid = call i32 @llvm.amdgcn.workitem.id.x() 619 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid 620 %load = load i8, i8 addrspace(1)* %gep, align 1 621 %cvt = uitofp i8 %load to float 622 store float %cvt, float addrspace(1)* %out, align 4 623 ret void 624} 625 626define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { 627; SI-LABEL: load_v2i8_to_v2f32: 628; SI: ; %bb.0: 629; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 630; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 631; SI-NEXT: s_mov_b32 s7, 0xf000 632; SI-NEXT: s_mov_b32 s2, 0 633; SI-NEXT: s_mov_b32 s3, s7 634; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 635; SI-NEXT: v_mov_b32_e32 v1, 0 636; SI-NEXT: s_waitcnt lgkmcnt(0) 637; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 638; SI-NEXT: s_mov_b32 s6, -1 639; SI-NEXT: s_waitcnt vmcnt(0) 640; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 641; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 642; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 643; SI-NEXT: s_endpgm 644; 645; VI-LABEL: load_v2i8_to_v2f32: 646; VI: ; %bb.0: 647; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 648; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 649; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 650; VI-NEXT: s_mov_b32 s7, 0xf000 651; VI-NEXT: s_mov_b32 s6, -1 652; VI-NEXT: s_waitcnt lgkmcnt(0) 653; VI-NEXT: v_mov_b32_e32 v1, s1 654; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 655; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 656; VI-NEXT: flat_load_ushort v0, v[0:1] 657; VI-NEXT: s_waitcnt vmcnt(0) 658; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 659; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 660; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 661; VI-NEXT: s_endpgm 662; 663; GFX10-LABEL: load_v2i8_to_v2f32: 664; GFX10: ; %bb.0: 665; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 666; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 667; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 668; GFX10-NEXT: v_mov_b32_e32 v2, 0 669; GFX10-NEXT: s_waitcnt lgkmcnt(0) 670; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] 671; GFX10-NEXT: s_waitcnt vmcnt(0) 672; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 673; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 674; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 675; GFX10-NEXT: s_endpgm 676 %tid = call i32 @llvm.amdgcn.workitem.id.x() 677 %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 
%tid 678 %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2 679 %cvt = uitofp <2 x i8> %load to <2 x float> 680 store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 681 ret void 682} 683 684define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { 685; SI-LABEL: load_v3i8_to_v3f32: 686; SI: ; %bb.0: 687; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 688; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 689; SI-NEXT: s_mov_b32 s7, 0xf000 690; SI-NEXT: s_mov_b32 s2, 0 691; SI-NEXT: s_mov_b32 s3, s7 692; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 693; SI-NEXT: v_mov_b32_e32 v1, 0 694; SI-NEXT: s_waitcnt lgkmcnt(0) 695; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 696; SI-NEXT: s_mov_b32 s6, -1 697; SI-NEXT: s_waitcnt vmcnt(0) 698; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2 699; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 700; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 701; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 702; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 703; SI-NEXT: s_endpgm 704; 705; VI-LABEL: load_v3i8_to_v3f32: 706; VI: ; %bb.0: 707; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 708; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 709; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 710; VI-NEXT: s_mov_b32 s7, 0xf000 711; VI-NEXT: s_mov_b32 s6, -1 712; VI-NEXT: s_waitcnt lgkmcnt(0) 713; VI-NEXT: v_mov_b32_e32 v1, s1 714; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 715; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 716; VI-NEXT: flat_load_dword v0, v[0:1] 717; VI-NEXT: s_waitcnt vmcnt(0) 718; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 719; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 720; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 721; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 722; VI-NEXT: s_endpgm 723; 724; GFX10-LABEL: load_v3i8_to_v3f32: 725; GFX10: ; %bb.0: 726; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 727; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 728; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 729; GFX10-NEXT: v_mov_b32_e32 v3, 0 730; GFX10-NEXT: s_waitcnt lgkmcnt(0) 731; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 732; GFX10-NEXT: s_waitcnt vmcnt(0) 733; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 734; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 735; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 736; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] 737; GFX10-NEXT: s_endpgm 738 %tid = call i32 @llvm.amdgcn.workitem.id.x() 739 %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid 740 %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 741 %cvt = uitofp <3 x i8> %load to <3 x float> 742 store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 743 ret void 744} 745 746define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { 747; SI-LABEL: load_v4i8_to_v4f32: 748; SI: ; %bb.0: 749; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 750; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 751; SI-NEXT: s_mov_b32 s7, 0xf000 752; SI-NEXT: s_mov_b32 s2, 0 753; SI-NEXT: s_mov_b32 s3, s7 754; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 755; SI-NEXT: v_mov_b32_e32 v1, 0 756; SI-NEXT: s_waitcnt lgkmcnt(0) 757; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 758; SI-NEXT: s_mov_b32 s6, -1 759; SI-NEXT: s_waitcnt vmcnt(0) 760; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 761; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 762; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 763; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 764; SI-NEXT: buffer_store_dwordx4 v[0:3], 
off, s[4:7], 0 765; SI-NEXT: s_endpgm 766; 767; VI-LABEL: load_v4i8_to_v4f32: 768; VI: ; %bb.0: 769; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 770; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 771; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 772; VI-NEXT: s_mov_b32 s7, 0xf000 773; VI-NEXT: s_mov_b32 s6, -1 774; VI-NEXT: s_waitcnt lgkmcnt(0) 775; VI-NEXT: v_mov_b32_e32 v1, s1 776; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 777; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 778; VI-NEXT: flat_load_dword v0, v[0:1] 779; VI-NEXT: s_waitcnt vmcnt(0) 780; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 781; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 782; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 783; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 784; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 785; VI-NEXT: s_endpgm 786; 787; GFX10-LABEL: load_v4i8_to_v4f32: 788; GFX10: ; %bb.0: 789; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 790; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 791; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 792; GFX10-NEXT: v_mov_b32_e32 v4, 0 793; GFX10-NEXT: s_waitcnt lgkmcnt(0) 794; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 795; GFX10-NEXT: s_waitcnt vmcnt(0) 796; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 797; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 798; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 799; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 800; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 801; GFX10-NEXT: s_endpgm 802 %tid = call i32 @llvm.amdgcn.workitem.id.x() 803 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid 804 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 805 %cvt = uitofp <4 x i8> %load to <4 x float> 806 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 807 ret void 808} 809 810; This should not be adding instructions to shift into the correct 811; position in the word for the component. 
812 813; FIXME: Packing bytes 814define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { 815; SI-LABEL: load_v4i8_to_v4f32_unaligned: 816; SI: ; %bb.0: 817; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 818; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 819; SI-NEXT: s_mov_b32 s7, 0xf000 820; SI-NEXT: s_mov_b32 s2, 0 821; SI-NEXT: s_mov_b32 s3, s7 822; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 823; SI-NEXT: v_mov_b32_e32 v1, 0 824; SI-NEXT: s_waitcnt lgkmcnt(0) 825; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 826; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1 827; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2 828; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 829; SI-NEXT: s_mov_b32 s6, -1 830; SI-NEXT: s_waitcnt vmcnt(2) 831; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2 832; SI-NEXT: s_waitcnt vmcnt(0) 833; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 834; SI-NEXT: v_or_b32_e32 v0, v0, v3 835; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 836; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 837; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 838; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 839; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 840; SI-NEXT: s_endpgm 841; 842; VI-LABEL: load_v4i8_to_v4f32_unaligned: 843; VI: ; %bb.0: 844; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 845; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 846; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 847; VI-NEXT: s_mov_b32 s7, 0xf000 848; VI-NEXT: s_mov_b32 s6, -1 849; VI-NEXT: s_waitcnt lgkmcnt(0) 850; VI-NEXT: v_mov_b32_e32 v1, s1 851; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 852; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 853; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 854; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 855; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 856; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 857; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 858; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 859; VI-NEXT: flat_load_ubyte v4, v[4:5] 860; VI-NEXT: flat_load_ubyte v5, v[6:7] 861; VI-NEXT: flat_load_ubyte v6, v[2:3] 862; VI-NEXT: flat_load_ubyte v0, v[0:1] 863; VI-NEXT: s_waitcnt vmcnt(3) 864; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 865; VI-NEXT: s_waitcnt vmcnt(2) 866; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 867; VI-NEXT: s_waitcnt vmcnt(1) 868; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 869; VI-NEXT: s_waitcnt vmcnt(0) 870; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 871; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 872; VI-NEXT: s_endpgm 873; 874; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: 875; GFX10: ; %bb.0: 876; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 877; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 878; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 879; GFX10-NEXT: v_mov_b32_e32 v6, 0 880; GFX10-NEXT: s_waitcnt lgkmcnt(0) 881; GFX10-NEXT: s_clause 0x3 882; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 883; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 884; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 885; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] 886; GFX10-NEXT: s_waitcnt vmcnt(3) 887; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 888; GFX10-NEXT: s_waitcnt vmcnt(2) 889; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 890; GFX10-NEXT: s_waitcnt vmcnt(1) 891; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 892; GFX10-NEXT: s_waitcnt vmcnt(0) 893; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 894; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] 895; GFX10-NEXT: s_endpgm 896 %tid = call i32 
@llvm.amdgcn.workitem.id.x() 897 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid 898 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 899 %cvt = uitofp <4 x i8> %load to <4 x float> 900 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 901 ret void 902} 903 904; FIXME: Need to handle non-uniform case for function below (load without gep). 905; Instructions still emitted to repack bytes for add use. 906define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { 907; SI-LABEL: load_v4i8_to_v4f32_2_uses: 908; SI: ; %bb.0: 909; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 910; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 911; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 912; SI-NEXT: s_mov_b32 s11, 0xf000 913; SI-NEXT: s_mov_b32 s2, 0 914; SI-NEXT: s_mov_b32 s3, s11 915; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 916; SI-NEXT: v_mov_b32_e32 v1, 0 917; SI-NEXT: s_waitcnt lgkmcnt(0) 918; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 919; SI-NEXT: s_mov_b32 s10, -1 920; SI-NEXT: s_movk_i32 s0, 0xff 921; SI-NEXT: s_mov_b32 s6, s10 922; SI-NEXT: s_mov_b32 s7, s11 923; SI-NEXT: s_waitcnt vmcnt(0) 924; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 925; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 926; SI-NEXT: v_and_b32_e32 v7, 0xff00, v4 927; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 928; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 929; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 930; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 931; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 932; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 933; SI-NEXT: s_waitcnt expcnt(0) 934; SI-NEXT: v_and_b32_e32 v0, s0, v4 935; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 936; SI-NEXT: v_or_b32_e32 v0, v7, v0 937; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 938; SI-NEXT: v_and_b32_e32 v2, s0, v2 939; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 940; SI-NEXT: v_or_b32_e32 v1, v1, v2 941; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 942; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 943; SI-NEXT: v_or_b32_e32 v0, v1, v0 944; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 945; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 946; SI-NEXT: s_endpgm 947; 948; VI-LABEL: load_v4i8_to_v4f32_2_uses: 949; VI: ; %bb.0: 950; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 951; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c 952; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 953; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 954; VI-NEXT: s_mov_b32 s11, 0xf000 955; VI-NEXT: s_mov_b32 s10, -1 956; VI-NEXT: v_mov_b32_e32 v5, 9 957; VI-NEXT: s_waitcnt lgkmcnt(0) 958; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 959; VI-NEXT: v_mov_b32_e32 v1, s1 960; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 961; VI-NEXT: flat_load_dword v4, v[0:1] 962; VI-NEXT: s_mov_b32 s6, s10 963; VI-NEXT: s_mov_b32 s7, s11 964; VI-NEXT: s_movk_i32 s0, 0x900 965; VI-NEXT: s_waitcnt vmcnt(0) 966; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 967; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 968; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 969; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 970; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 971; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 972; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 973; VI-NEXT: v_add_u16_e32 v8, 9, v4 974; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 975; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 976; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 977; 
VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 978; VI-NEXT: v_mov_b32_e32 v2, s0 979; VI-NEXT: v_add_u16_e32 v0, s0, v0 980; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 981; VI-NEXT: v_or_b32_e32 v0, v0, v1 982; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 983; VI-NEXT: s_endpgm 984; 985; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: 986; GFX10: ; %bb.0: 987; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 988; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 989; GFX10-NEXT: v_mov_b32_e32 v1, 24 990; GFX10-NEXT: s_waitcnt lgkmcnt(0) 991; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 992; GFX10-NEXT: s_clause 0x1 993; GFX10-NEXT: s_waitcnt_depctr 0xffe3 994; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 995; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 996; GFX10-NEXT: s_movk_i32 s0, 0x900 997; GFX10-NEXT: s_waitcnt vmcnt(0) 998; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 999; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1000; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 1001; GFX10-NEXT: v_add_nc_u16 v4, v0, 9 1002; GFX10-NEXT: v_add_nc_u16 v2, v2, 9 1003; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1004; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1005; GFX10-NEXT: v_mov_b32_e32 v4, 0 1006; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1007; GFX10-NEXT: v_add_nc_u16 v1, v1, s0 1008; GFX10-NEXT: v_add_nc_u16 v5, v2, s0 1009; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1010; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 1011; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1012; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1013; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1014; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1015; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1016; GFX10-NEXT: global_store_dword v4, v5, s[4:5] 1017; GFX10-NEXT: s_endpgm 1018 %tid.x = call i32 @llvm.amdgcn.workitem.id.x() 1019 %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x 1020 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 1021 %cvt = uitofp <4 x i8> %load to <4 x float> 1022 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 1023 %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load 1024 store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4 1025 ret void 1026} 1027 1028; Make sure this doesn't crash. 
1029define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { 1030; SI-LABEL: load_v7i8_to_v7f32: 1031; SI: ; %bb.0: 1032; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1033; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1034; SI-NEXT: s_mov_b32 s7, 0xf000 1035; SI-NEXT: s_mov_b32 s2, 0 1036; SI-NEXT: s_mov_b32 s3, s7 1037; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1038; SI-NEXT: v_mov_b32_e32 v1, 0 1039; SI-NEXT: s_waitcnt lgkmcnt(0) 1040; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 1041; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 1042; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 1043; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3 1044; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4 1045; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5 1046; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6 1047; SI-NEXT: s_mov_b32 s6, -1 1048; SI-NEXT: s_waitcnt vmcnt(6) 1049; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 1050; SI-NEXT: s_waitcnt vmcnt(5) 1051; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 1052; SI-NEXT: s_waitcnt vmcnt(3) 1053; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 1054; SI-NEXT: v_or_b32_e32 v3, v9, v6 1055; SI-NEXT: s_waitcnt vmcnt(1) 1056; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 1057; SI-NEXT: s_waitcnt vmcnt(0) 1058; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 1059; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 1060; SI-NEXT: s_waitcnt expcnt(0) 1061; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 1062; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 1063; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 1064; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 1065; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 1066; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1067; SI-NEXT: s_endpgm 1068; 1069; VI-LABEL: load_v7i8_to_v7f32: 1070; VI: ; %bb.0: 1071; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1072; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1073; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1074; VI-NEXT: s_mov_b32 s7, 0xf000 1075; VI-NEXT: s_mov_b32 s6, -1 1076; VI-NEXT: s_waitcnt lgkmcnt(0) 1077; VI-NEXT: v_mov_b32_e32 v1, s1 1078; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1079; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1080; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 1081; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1082; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 1083; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1084; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 1085; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 1086; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 1087; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc 1088; VI-NEXT: flat_load_ubyte v10, v[4:5] 1089; VI-NEXT: flat_load_ubyte v11, v[6:7] 1090; VI-NEXT: flat_load_ubyte v8, v[8:9] 1091; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 1092; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1093; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 1094; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 1095; VI-NEXT: flat_load_ubyte v6, v[6:7] 1096; VI-NEXT: flat_load_ubyte v4, v[4:5] 1097; VI-NEXT: flat_load_ubyte v2, v[2:3] 1098; VI-NEXT: flat_load_ubyte v0, v[0:1] 1099; VI-NEXT: s_waitcnt vmcnt(4) 1100; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 1101; VI-NEXT: s_waitcnt vmcnt(3) 1102; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 1103; VI-NEXT: s_waitcnt vmcnt(2) 1104; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 1105; VI-NEXT: s_waitcnt vmcnt(1) 1106; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1107; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1108; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11 1109; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 1110; VI-NEXT: s_waitcnt vmcnt(0) 1111; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1112; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 1113; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 1114; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1115; VI-NEXT: s_endpgm 1116; 1117; GFX10-LABEL: load_v7i8_to_v7f32: 1118; GFX10: ; %bb.0: 1119; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1120; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1121; GFX10-NEXT: v_mov_b32_e32 v2, 0 1122; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1123; GFX10-NEXT: v_mov_b32_e32 v8, 0 1124; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX10-NEXT: s_clause 0x5 1126; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2 1127; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 1128; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4 1129; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 1130; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 1131; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3] 1132; GFX10-NEXT: s_waitcnt vmcnt(4) 1133; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1 1134; GFX10-NEXT: s_waitcnt vmcnt(2) 1135; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 1136; GFX10-NEXT: s_waitcnt vmcnt(1) 1137; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 1138; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 1139; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1140; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 1141; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1142; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1143; GFX10-NEXT: s_waitcnt vmcnt(0) 1144; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 1145; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 1146; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] 1147; GFX10-NEXT: s_endpgm 1148 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1149 %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid 1150 %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1 1151 %cvt = uitofp <7 x i8> %load to <7 x float> 1152 store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 1153 ret void 1154} 1155 1156define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { 1157; SI-LABEL: load_v8i8_to_v8f32: 1158; SI: ; %bb.0: 1159; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1160; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1161; SI-NEXT: s_mov_b32 s7, 0xf000 1162; SI-NEXT: s_mov_b32 s2, 0 1163; SI-NEXT: s_mov_b32 s3, s7 1164; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1165; SI-NEXT: v_mov_b32_e32 v1, 0 1166; SI-NEXT: s_waitcnt lgkmcnt(0) 1167; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64 1168; SI-NEXT: s_mov_b32 s6, -1 1169; SI-NEXT: s_waitcnt vmcnt(0) 1170; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 1171; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 1172; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 1173; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 1174; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 1175; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 1176; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 1177; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 1178; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 1179; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1180; SI-NEXT: s_endpgm 1181; 1182; VI-LABEL: load_v8i8_to_v8f32: 1183; VI: ; %bb.0: 1184; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1185; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1186; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1187; VI-NEXT: s_mov_b32 s7, 
0xf000 1188; VI-NEXT: s_mov_b32 s6, -1 1189; VI-NEXT: s_waitcnt lgkmcnt(0) 1190; VI-NEXT: v_mov_b32_e32 v1, s1 1191; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1192; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1193; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] 1194; VI-NEXT: s_waitcnt vmcnt(0) 1195; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 1196; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 1197; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 1198; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 1199; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 1200; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 1201; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 1202; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 1203; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 1204; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1205; VI-NEXT: s_endpgm 1206; 1207; GFX10-LABEL: load_v8i8_to_v8f32: 1208; GFX10: ; %bb.0: 1209; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1210; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1211; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1212; GFX10-NEXT: v_mov_b32_e32 v10, 0 1213; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1214; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] 1215; GFX10-NEXT: s_waitcnt vmcnt(0) 1216; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 1217; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 1218; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 1219; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 1220; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v8 1221; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 1222; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 1223; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 1224; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 1225; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] 1226; GFX10-NEXT: s_endpgm 1227 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1228 %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid 1229 %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 1230 %cvt = uitofp <8 x i8> %load to <8 x float> 1231 store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 1232 ret void 1233} 1234 1235define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { 1236; SI-LABEL: i8_zext_inreg_i32_to_f32: 1237; SI: ; %bb.0: 1238; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1239; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1240; SI-NEXT: s_mov_b32 s7, 0xf000 1241; SI-NEXT: s_mov_b32 s2, 0 1242; SI-NEXT: s_mov_b32 s3, s7 1243; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1244; SI-NEXT: v_mov_b32_e32 v1, 0 1245; SI-NEXT: s_waitcnt lgkmcnt(0) 1246; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 1247; SI-NEXT: s_mov_b32 s6, -1 1248; SI-NEXT: s_waitcnt vmcnt(0) 1249; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 1250; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1251; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1252; SI-NEXT: s_endpgm 1253; 1254; VI-LABEL: i8_zext_inreg_i32_to_f32: 1255; VI: ; %bb.0: 1256; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1257; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1258; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1259; VI-NEXT: s_mov_b32 s7, 0xf000 1260; VI-NEXT: s_mov_b32 s6, -1 1261; VI-NEXT: s_waitcnt lgkmcnt(0) 1262; VI-NEXT: v_mov_b32_e32 v1, s1 1263; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1264; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1265; VI-NEXT: flat_load_dword v0, v[0:1] 1266; VI-NEXT: s_waitcnt vmcnt(0) 1267; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 1268; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1269; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1270; VI-NEXT: s_endpgm 1271; 1272; GFX10-LABEL: 
i8_zext_inreg_i32_to_f32: 1273; GFX10: ; %bb.0: 1274; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1275; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1276; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1277; GFX10-NEXT: v_mov_b32_e32 v1, 0 1278; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1279; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1280; GFX10-NEXT: s_waitcnt vmcnt(0) 1281; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 1282; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1283; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1284; GFX10-NEXT: s_endpgm 1285 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1286 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 1287 %load = load i32, i32 addrspace(1)* %gep, align 4 1288 %add = add i32 %load, 2 1289 %inreg = and i32 %add, 255 1290 %cvt = uitofp i32 %inreg to float 1291 store float %cvt, float addrspace(1)* %out, align 4 1292 ret void 1293} 1294 1295define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { 1296; SI-LABEL: i8_zext_inreg_hi1_to_f32: 1297; SI: ; %bb.0: 1298; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1299; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1300; SI-NEXT: s_mov_b32 s7, 0xf000 1301; SI-NEXT: s_mov_b32 s2, 0 1302; SI-NEXT: s_mov_b32 s3, s7 1303; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1304; SI-NEXT: v_mov_b32_e32 v1, 0 1305; SI-NEXT: s_waitcnt lgkmcnt(0) 1306; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 1307; SI-NEXT: s_mov_b32 s6, -1 1308; SI-NEXT: s_waitcnt vmcnt(0) 1309; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 1310; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1311; SI-NEXT: s_endpgm 1312; 1313; VI-LABEL: i8_zext_inreg_hi1_to_f32: 1314; VI: ; %bb.0: 1315; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1316; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1317; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1318; VI-NEXT: s_mov_b32 s7, 0xf000 1319; VI-NEXT: s_mov_b32 s6, -1 1320; VI-NEXT: s_waitcnt lgkmcnt(0) 1321; VI-NEXT: v_mov_b32_e32 v1, s1 1322; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1323; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1324; VI-NEXT: flat_load_dword v0, v[0:1] 1325; VI-NEXT: s_waitcnt vmcnt(0) 1326; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 1327; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1328; VI-NEXT: s_endpgm 1329; 1330; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: 1331; GFX10: ; %bb.0: 1332; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1333; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1334; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1335; GFX10-NEXT: v_mov_b32_e32 v1, 0 1336; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1337; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1338; GFX10-NEXT: s_waitcnt vmcnt(0) 1339; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 1340; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1341; GFX10-NEXT: s_endpgm 1342 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1343 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 1344 %load = load i32, i32 addrspace(1)* %gep, align 4 1345 %inreg = and i32 %load, 65280 1346 %shr = lshr i32 %inreg, 8 1347 %cvt = uitofp i32 %shr to float 1348 store float %cvt, float addrspace(1)* %out, align 4 1349 ret void 1350} 1351 1352; We don't get these ones because of the zext, but instcombine removes 1353; them so it shouldn't really matter. 
1354define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { 1355; SI-LABEL: i8_zext_i32_to_f32: 1356; SI: ; %bb.0: 1357; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1358; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1359; SI-NEXT: s_mov_b32 s7, 0xf000 1360; SI-NEXT: v_mov_b32_e32 v1, 0 1361; SI-NEXT: s_mov_b32 s2, 0 1362; SI-NEXT: s_mov_b32 s3, s7 1363; SI-NEXT: s_waitcnt lgkmcnt(0) 1364; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 1365; SI-NEXT: s_mov_b32 s6, -1 1366; SI-NEXT: s_waitcnt vmcnt(0) 1367; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1368; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1369; SI-NEXT: s_endpgm 1370; 1371; VI-LABEL: i8_zext_i32_to_f32: 1372; VI: ; %bb.0: 1373; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1374; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1375; VI-NEXT: s_mov_b32 s7, 0xf000 1376; VI-NEXT: s_mov_b32 s6, -1 1377; VI-NEXT: s_waitcnt lgkmcnt(0) 1378; VI-NEXT: v_mov_b32_e32 v1, s1 1379; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1380; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1381; VI-NEXT: flat_load_ubyte v0, v[0:1] 1382; VI-NEXT: s_waitcnt vmcnt(0) 1383; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1384; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1385; VI-NEXT: s_endpgm 1386; 1387; GFX10-LABEL: i8_zext_i32_to_f32: 1388; GFX10: ; %bb.0: 1389; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1390; GFX10-NEXT: v_mov_b32_e32 v1, 0 1391; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1392; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1393; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 1394; GFX10-NEXT: s_waitcnt vmcnt(0) 1395; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1396; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1397; GFX10-NEXT: s_endpgm 1398 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1399 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid 1400 %load = load i8, i8 addrspace(1)* %gep, align 1 1401 %ext = zext i8 %load to i32 1402 %cvt = uitofp i32 %ext to float 1403 store float %cvt, float addrspace(1)* %out, align 4 1404 ret void 1405} 1406 1407define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { 1408; SI-LABEL: v4i8_zext_v4i32_to_v4f32: 1409; SI: ; %bb.0: 1410; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1411; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1412; SI-NEXT: s_mov_b32 s7, 0xf000 1413; SI-NEXT: s_mov_b32 s2, 0 1414; SI-NEXT: s_mov_b32 s3, s7 1415; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1416; SI-NEXT: v_mov_b32_e32 v1, 0 1417; SI-NEXT: s_waitcnt lgkmcnt(0) 1418; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 1419; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1 1420; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2 1421; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 1422; SI-NEXT: s_mov_b32 s6, -1 1423; SI-NEXT: s_waitcnt vmcnt(2) 1424; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2 1425; SI-NEXT: s_waitcnt vmcnt(0) 1426; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1427; SI-NEXT: v_or_b32_e32 v0, v0, v3 1428; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1429; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1430; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1431; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 1432; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1433; SI-NEXT: s_endpgm 1434; 1435; VI-LABEL: v4i8_zext_v4i32_to_v4f32: 1436; VI: ; %bb.0: 1437; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1438; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1439; VI-NEXT: v_lshlrev_b32_e32 v0, 2, 
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v5, v[6:7]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 8, v2
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %ext = zext <4 x i8> %load to <4 x i32>
  %cvt = uitofp <4 x i32> %ext to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: extract_byte0_to_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %and = and i32 %val, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte1_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte1_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: extract_byte1_to_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 8
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte2_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte2_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: extract_byte2_to_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 16
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte3_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte3_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: extract_byte3_to_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 24
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; SI-NEXT: v_add_f32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; VI-NEXT: v_add_f32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: cvt_ubyte0_or_multiuse:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep
  %or = or i32 %load, -2147483647
  %and = and i32 %or, 255
  %uitofp = uitofp i32 %and to float
  %cast = bitcast i32 %or to float
  %add = fadd float %cast, %uitofp
  store float %add, float addrspace(1)* %out
  ret void
}