; Codegen test for inserting a loaded 16-bit value into element 0 of a
; two-element 16-bit vector (<2 x i16> / <2 x half>). The gfx900 run is checked
; for the "_d16" load forms where the check lines say so; the gfx906
; (+sram-ecc) and fiji runs are checked for the plain, non-d16 loads.

; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s

; Low element loaded from addrspace(3); the high element is left undef.
; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16 v0, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  ret <2 x i16> %build
}

; High element comes from the i16 argument, low element from the load.
; FIXME: No RUN line enables a GFX9 check prefix (only GCN, GFX900, GFX906,
; GFX803 and NO-D16-HI are passed to FileCheck), so the GFX9 lines below are
; currently ignored.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo:
; GCN: s_waitcnt
; GCN: ds_read_u16 v0, v0
; GFX9: v_and_b32_e32 v0, 0xffff, v0
; GFX9: v_lshl_or_b32 v0, v1, 16, v0
; GFX9: s_setpc_b64
define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
  ret <2 x i16> %build1
}

; Show that we get reasonable regalloc without physreg constraints.
; FIXME: No RUN line enables a GFX9 check prefix (only GCN, GFX900, GFX906,
; GFX803 and NO-D16-HI are passed to FileCheck), so the GFX9 lines below are
; currently ignored.
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GCN: ds_read_u16 v0, v0
; GCN: s_waitcnt
; GFX9: v_and_b32_e32 v0, 0xffff, v0
; GFX9: v_lshl_or_b32 v0, v1, 16, v0
; GFX9: global_store_dword v
define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2i16_zerolo:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
  ret <2 x i16> %build
}

; GCN-LABEL: {{^}}load_local_lo_v2f16_fpimm:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 2.0
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
entry:
  %load = load half, half addrspace(3)* %in
  %build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
  ret <2 x half> %build
}

; GCN-LABEL: {{^}}load_local_lo_v2f16_reghi_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load half, half addrspace(3)* %in
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg:

; GFX900: ds_read_u16 v
; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
; GFX900: global_store_dword

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load half, half addrspace(3)* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 1
  %build1 = insertelement <2 x half> %build0, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: ds_read_u8 v
; GFX900: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: ds_read_i8 v
; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}

; NO-D16-HI: ds_read_i8 v
define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: ds_read_u8 v
; GFX900: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 1
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: ds_read_i8 v
; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}

; NO-D16-HI: ds_read_i8 v
define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 1
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; The multi_use tests below give the loaded value and/or the original high
; element an additional use (an extra 16-bit store).

; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lo:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900: ds_read_u16 v0, v0
; GFX900: v_mov_b32_e32 v3, 0
; GFX900: v_mov_b32_e32 v2, 0xffff
; GFX900: s_waitcnt lgkmcnt(0)
; GFX900: ds_write_b16 v3, v0
; GFX900: v_bfi_b32 v0, v2, v0, v1
; GFX900: global_store_dword v[0:1], v0, off
; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900: s_setpc_b64 s[30:31]

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %elt1 = extractelement <2 x i16> %reg, i32 1
  store i16 %load, i16 addrspace(3)* null
  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_hi:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900: v_lshrrev_b32_e32 v2, 16, v1
; GFX900: ds_read_u16_d16 v1, v0
; GFX900: v_mov_b32_e32 v0, 0
; GFX900: ds_write_b16 v0, v2
; GFX900: s_waitcnt lgkmcnt(1)
; GFX900: global_store_dword v[0:1], v1, off
; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900: s_setpc_b64 s[30:31]

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %elt1 = extractelement <2 x i16> %reg, i32 1
  store i16 %elt1, i16 addrspace(3)* null
  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900: ds_read_u16 v0, v0
; GFX900: v_lshrrev_b32_e32 v[[A_F16:[0-9]+]], 16, v1
; GFX900: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0xffff
; GFX900: s_waitcnt lgkmcnt(0)
; GFX900: ds_write_b16 v2, v0
; GFX900: ds_write_b16 v3, v[[A_F16]]
; GFX900: v_bfi_b32 v0, v[[A_F32]], v0, v1
; GFX900: global_store_dword v[0:1], v0, off
; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900: s_setpc_b64 s[30:31]

; NO-D16-HI: ds_read_u16 v
define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %elt1 = extractelement <2 x i16> %reg, i32 1
  store i16 %load, i16 addrspace(3)* %out0
  store i16 %elt1, i16 addrspace(3)* %out1
  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Loads from addrspace(1). The getelementptr constants (-2047 x 2 bytes,
; -4095 x 1 byte) correspond to the negative immediate offsets in the checks.

; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ushort v0, v[0:1], off offset:-4094
; GFX906: v_bfi_b32
define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
  %load = load i16, i16 addrspace(1)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ushort v0, v[0:1], off offset:-4094
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32_e32
; GFX906: v_lshl_or_b32

; GFX803: flat_load_ushort
define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
  %load = load half, half addrspace(1)* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906: v_bfi_b32

; GFX803: flat_load_ubyte
define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906: v_bfi_b32

; GFX803: flat_load_sbyte
define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906: v_and_b32_e32
; GFX906: v_lshl_or_b32

; GFX803: flat_load_ubyte
define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32
; GFX906: v_lshl_or_b32

; GFX803: flat_load_sbyte
define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Loads through pointers with no addrspace qualifier (the "flat" tests).

; GCN-LABEL: {{^}}load_flat_lo_v2i16_reghi_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort v{{[0-9]+}}
; GFX803: v_or_b32_e32

; GFX906: flat_load_ushort [[LOAD:v[0-9]+]]
; GFX906: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
; GFX906: v_bfi_b32 v{{[0-9]+}}, [[MASK]], [[LOAD]], v2
; GFX906: global_store_dword
define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i16, i16* %in
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_lo_v2f16_reghi_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort v{{[0-9]+}}
; GFX803: v_or_b32_e32

; FIXME: and should be removable
; GFX906: flat_load_ushort [[LOAD:v[0-9]+]]
; GFX906: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, v2
; GFX906: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
; GFX906: v_lshl_or_b32 [[LSHL_OR:v[0-9]+]], [[SHR]], 16, [[AND]]
; GFX906: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[LSHL_OR]]
define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load half, half* %in
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ubyte [[LO:v[0-9]+]]
; GFX803: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2
; GFX803: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00
; GFX803: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]]
; GFX803: flat_store_dword v[0:1], [[RES]]

; GFX906: flat_load_ubyte
; GFX906: v_bfi_b32
define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX906: flat_load_sbyte
; GFX906: v_bfi_b32
define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ubyte [[LO:v[0-9]+]]
; GFX803: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2
; GFX803: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00
; GFX803: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]]
; GFX803: flat_store_dword v[0:1], [[RES]]

; GFX906: flat_load_ubyte
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32
; GFX906: v_lshl_or_b32
define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX906: flat_load_sbyte
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32
; GFX906: v_lshl_or_b32
define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Loads from addrspace(5). The byval-argument tests are checked against an
; s32-based offset; the *_nooff tests load from a constant address built with
; inttoptr and are checked against an s33-based offset.

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %load = load i16, i16 addrspace(5)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_ushort v1, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900: v_and_b32
; GFX900: v_lshl_or_b32

; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
  %load = load half, half addrspace(5)* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %bc.ext = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Loads from addrspace(4); the checks expect the same global/flat load
; instructions as the addrspace(1) tests.

; GCN-LABEL: {{^}}load_constant_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort

; GFX906: global_load_ushort
define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %load = load i16, i16 addrspace(4)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort

; GFX906: global_load_ushort
define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906: v_and_b32_e32
; GFX906: v_lshl_or_b32

; GFX803: flat_load_ubyte
define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906: v_lshrrev_b32
; GFX906: v_and_b32
; GFX906: v_lshl_or_b32

; GFX803: flat_load_sbyte
define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; The *_to_offset tests load from the second of two allocas, after a volatile
; store to the first; the checks expect the load to use an s32-based immediate
; offset.

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094

; NO-D16-HI: buffer_load_ushort v
define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
  %load = load volatile i16, i16 addrspace(5)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095

; NO-D16-HI: buffer_load_sbyte v
define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load volatile i8, i8 addrspace(5)* %gep
  %load.ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095

; NO-D16-HI: buffer_load_ubyte v
define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load volatile i8, i8 addrspace(5)* %gep
  %load.ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095

; NO-D16-HI: buffer_load_sbyte v
define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %reg.bc = bitcast i32 %reg to <2 x half>
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load volatile i8, i8 addrspace(5)* %gep
  %load.ext = sext i8 %load to i16
  %bitcast =
bitcast i16 %load.ext to half 971 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 972 store <2 x half> %build1, <2 x half> addrspace(1)* undef 973 ret void 974} 975 976; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: 977; GFX900: buffer_store_dword 978; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 979 980; NO-D16-HI: buffer_load_ubyte v 981define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { 982entry: 983 %obj0 = alloca [10 x i32], align 4, addrspace(5) 984 %obj1 = alloca [4096 x i8], align 2, addrspace(5) 985 %reg.bc = bitcast i32 %reg to <2 x half> 986 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* 987 store volatile i32 123, i32 addrspace(5)* %bc 988 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055 989 %load = load volatile i8, i8 addrspace(5)* %gep 990 %load.ext = zext i8 %load to i16 991 %bitcast = bitcast i16 %load.ext to half 992 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0 993 store <2 x half> %build1, <2 x half> addrspace(1)* undef 994 ret void 995} 996 997attributes #0 = { nounwind } 998