; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s

; Most functions below load a 16-bit value (or an i8 extended to i16) and
; insert it into element 1 of a <2 x i16> / <2 x half>. The gfx900 checks
; expect the *_d16_hi form of the load; the checks shared by the gfx906 and
; fiji runs expect the plain load instead.

; Local (addrspace 3) loads where the low and/or high element has a second use.

; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v2, v0
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, v2
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-NEXT: ds_write_b16 v3, v2
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.lo, i16 addrspace(3)* null
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_write_b16 v2, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.hi, i16 addrspace(3)* null
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi:
; GFX900: ds_read_u16 v3, v0
; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v1, v3
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v2, v0
; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.lo, i16 addrspace(3)* %out0
  store i16 %load.hi, i16 addrspace(3)* %out1
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; Local (addrspace 3) loads into the high element with an undef, register,
; or zero low element.

; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  ret <2 x i16> %build
}

; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  ret <2 x i16> %build1
}

; Show that we get reasonable regalloc without physreg constraints.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
  ret <2 x i16> %build
}

; FIXME: Remove m0 initialization
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %zext = zext i16 %load to i32
  %shift = shl i32 %zext, 16
  ret i32 %shift
}

; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load half, half addrspace(3)* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half

  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half

  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Global (addrspace 1) loads at a negative constant offset.

; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
  %load = load i16, i16 addrspace(1)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
  %load = load half, half addrspace(1)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Flat (generic address space) loads.

; The label is anchored at line start like every other label in this file.
; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
entry:
  %load = load half, half* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Private (addrspace 5) loads, either relative to a byval argument or from a
; constant address built with inttoptr (the "_nooff" variants).

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
  %load = load half, half addrspace(5)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s33 offset:4094{{$}}
; GFX900: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %bc.ext = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Constant (addrspace 4) loads at a negative constant offset.

; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %load = load i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; The label is anchored with a line-start match and a trailing colon so it
; cannot match inside the longer _sexti8 / _zexti8 symbol names.
; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Local object gives known offset, so requires converting from offen
; to offset variant.

; The three *_to_offset tests load from a second stack object after a volatile
; store to a first one, and expect the load to use an immediate offset.

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
  %load0 = load volatile i16, i16 addrspace(3)* %in
  %load1 = load volatile i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
; GFX900: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16
; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; One loaded value is inserted into both elements of the vector.
; The v_and / v_lshl_or checks below were written with a prefix that no RUN
; line enables, so FileCheck silently ignored them; they now use the gfx900
; prefix. NOTE(review): confirm these two lines against actual gfx900 output.
; FIXME: Remove and
; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
; GCN-NOT: ds_read
; GFX900: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
; GFX900: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
entry:
  %load0 = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
  ret <2 x i16> %build1
}

; A store through a possibly-aliasing pointer sits between the two loads; the
; checks expect the write to stay between the two reads.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
; GFX900: ds_write_b16
; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16

; NO-D16-HI: ds_read_u16
; NO-D16-HI: ds_write_b16
; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  store i16 123, i16 addrspace(3)* %may.alias
  %load.hi = load i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_global_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)*
%in) #0 { 927entry: 928 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1 929 %load0 = load volatile i16, i16 addrspace(1)* %in 930 %load1 = load volatile i16, i16 addrspace(1)* %gep 931 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 932 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 933 ret <2 x i16> %build1 934} 935 936; FIXME: Remove waitcnt between reads 937; GCN-LABEL: {{^}}load_flat_v2i16_split: 938; GCN: s_waitcnt 939; GFX900-NEXT: flat_load_ushort v2 940; GFX900-NEXT: s_waitcnt 941; GFX900-NEXT: flat_load_short_d16_hi v2 942; GFX900-NEXT: s_waitcnt 943; GFX900-NEXT: v_mov_b32_e32 v0, v2 944; GFX900-NEXT: s_setpc_b64 945define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 { 946entry: 947 %gep = getelementptr inbounds i16, i16* %in, i64 1 948 %load0 = load volatile i16, i16* %in 949 %load1 = load volatile i16, i16* %gep 950 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 951 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 952 ret <2 x i16> %build1 953} 954 955; FIXME: Remove waitcnt between reads 956; GCN-LABEL: {{^}}load_constant_v2i16_split: 957; GCN: s_waitcnt 958; GFX900-NEXT: global_load_ushort v2 959; GFX900-NEXT: s_waitcnt 960; GFX900-NEXT: global_load_short_d16_hi v2 961; GFX900-NEXT: s_waitcnt 962; GFX900-NEXT: v_mov_b32_e32 v0, v2 963; GFX900-NEXT: s_setpc_b64 964define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 { 965entry: 966 %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1 967 %load0 = load volatile i16, i16 addrspace(4)* %in 968 %load1 = load volatile i16, i16 addrspace(4)* %gep 969 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 970 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 971 ret <2 x i16> %build1 972} 973 974; FIXME: Remove m0 init and waitcnt between reads 975; FIXME: Is there a cost to using the extload over not? 
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
; Two volatile i16 loads from a byval addrspace(5) pointer, at %in and at the
; element right after it, are packed into lanes 0 and 1 of the returned
; <2 x i16>. On gfx900 the checks above expect the second load to be a d16_hi
; buffer load that writes the register already holding the low half (v0).
define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
entry:
  ; Address of the i16 element that follows %in.
  %hi.ptr = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
  ; Both loads are volatile; the low half is read first.
  %lo = load volatile i16, i16 addrspace(5)* %in
  %hi = load volatile i16, i16 addrspace(5)* %hi.ptr
  ; Assemble the result: lane 0 = low half, lane 1 = high half.
  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
  ret <2 x i16> %vec
}

; Attribute group referenced as #0 by the functions in this file.
attributes #0 = { nounwind }