; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX906,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s

; Checks which load instructions are selected when a loaded 16-bit value (or
; an 8-bit value extended to 16 bits) is inserted into element 1 of a
; <2 x i16> or <2 x half>. The gfx900 run is expected to use the *_d16_hi
; load forms; the gfx906 and fiji runs are expected to use the plain loads.
;
; Check prefixes: GCN is common to all runs, GFX9 is shared by the gfx900 and
; gfx906 runs, and NO-D16-HI is shared by the gfx906 and fiji runs.

; Both elements are loaded from LDS; the low element has a second use (it is
; also stored to LDS address 0).
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v2, v0
; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GFX900-DAG: s_waitcnt lgkmcnt(0)
; GFX900-DAG: v_mov_b32_e32 v1, v2
; GFX900-DAG: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900: ds_write_b16 [[ZERO]], v2
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.lo, i16 addrspace(3)* null
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; Same as above, but the high element is the one with the extra store use.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0
; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16
; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]]
; GFX900-DAG: s_waitcnt lgkmcnt(0)
; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]]
; GFX900: v_lshl_or_b32 [[HI]], [[HI]], 16, [[AND]]
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.hi, i16 addrspace(3)* null
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; Both loaded elements have an extra store use (to %out0 and %out1).
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi:
; GFX900: ds_read_u16 v3, v0
; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v1, v3
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v2, v0
; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.lo, i16 addrspace(3)* %out0
  store i16 %load.hi, i16 addrspace(3)* %out1
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; LDS load into the high element; the low element is undef.
; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  ret <2 x i16> %build
}

; LDS load into the high element; the low element is the %reg argument.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  ret <2 x i16> %build1
}

; Show that we get reasonable regalloc without physreg constraints.
; (The vector is stored to memory instead of being returned.)
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; LDS load into the high element of a zero vector.
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
  ret <2 x i16> %build
}

; The same value expressed as zext + shl by 16 on an i32 rather than as a
; vector insert; a plain load followed by a shift is expected.
; FIXME: Remove m0 initialization
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %zext = zext i16 %load to i32
  %shift = shl i32 %zext, 16
  ret i32 %shift
}

; LDS (addrspace 3) loads: half element type, then i8 loads that are
; zero-/sign-extended to 16 bits before the insert.
; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load half, half addrspace(3)* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half

  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half

  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Global (addrspace 1) loads. The GEPs use a negative element index so that
; the expected instruction carries a negative immediate offset (-4094 bytes
; for 16-bit elements, -4095 bytes for i8).
; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
  %load = load i16, i16 addrspace(1)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
  %load = load half, half addrspace(1)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Flat (default address space) loads. For the runs without d16_hi selection
; the checks expect a plain load followed by a shift/or sequence.
; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
entry:
  %load = load half, half* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Private (addrspace 5) loads through a byval pointer argument. The GEP index
; is chosen so that the expected instruction has immediate offset 4094
; (16-bit elements) or 4095 (i8).
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
  %load = load half, half addrspace(5)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; "nooff" variants: volatile private load from the constant address 4094
; (inttoptr), so the %in argument is not used by the load.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
; GFX900: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %bc.ext = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Constant (addrspace 4) loads; the gfx900 checks expect the same global_load
; d16_hi forms as the addrspace 1 tests.
; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %load = load i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Local object gives known offset, so requires converting from offen
; to offset variant.

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Both halves come from volatile LDS loads.
; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
  %load0 = load volatile i16, i16 addrspace(3)* %in
  %load1 = load volatile i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; Two non-volatile LDS loads with no intervening memory operation.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
; GFX900: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16
; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; One loaded value is inserted into both elements; only a single load is
; expected.
; FIXME: Remove and
; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
; GCN-NOT: ds_read
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
; GFX9: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
entry:
  %load0 = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
  ret <2 x i16> %build1
}

; A store through a pointer that is not marked noalias sits between the two
; loads; the checks require the write to stay between the reads.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
; GFX900: ds_write_b16
; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16

; NO-D16-HI: ds_read_u16
; NO-D16-HI: ds_write_b16
; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  store i16 123, i16 addrspace(3)* %may.alias
  %load.hi = load i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_global_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)*
%in) #0 { 926entry: 927 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1 928 %load0 = load volatile i16, i16 addrspace(1)* %in 929 %load1 = load volatile i16, i16 addrspace(1)* %gep 930 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 931 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 932 ret <2 x i16> %build1 933} 934 935; FIXME: Remove waitcnt between reads 936; GCN-LABEL: {{^}}load_flat_v2i16_split: 937; GCN: s_waitcnt 938; GFX900-NEXT: flat_load_ushort v2 939; GFX900-NEXT: s_waitcnt 940; GFX900-NEXT: flat_load_short_d16_hi v2 941; GFX900-NEXT: s_waitcnt 942; GFX900-NEXT: v_mov_b32_e32 v0, v2 943; GFX900-NEXT: s_setpc_b64 944define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 { 945entry: 946 %gep = getelementptr inbounds i16, i16* %in, i64 1 947 %load0 = load volatile i16, i16* %in 948 %load1 = load volatile i16, i16* %gep 949 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 950 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 951 ret <2 x i16> %build1 952} 953 954; FIXME: Remove waitcnt between reads 955; GCN-LABEL: {{^}}load_constant_v2i16_split: 956; GCN: s_waitcnt 957; GFX900-NEXT: global_load_ushort v2 958; GFX900-NEXT: s_waitcnt 959; GFX900-NEXT: global_load_short_d16_hi v2 960; GFX900-NEXT: s_waitcnt 961; GFX900-NEXT: v_mov_b32_e32 v0, v2 962; GFX900-NEXT: s_setpc_b64 963define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 { 964entry: 965 %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1 966 %load0 = load volatile i16, i16 addrspace(4)* %in 967 %load1 = load volatile i16, i16 addrspace(4)* %gep 968 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 969 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 970 ret <2 x i16> %build1 971} 972 973; FIXME: Remove m0 init and waitcnt between reads 974; FIXME: Is there a cost to using the extload over not? 
; Two volatile i16 loads from adjacent elements of a byval addrspace(5)
; argument, packed into one <2 x i16>; the second load is expected to use the
; d16_hi form at offset 2.
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
  %load0 = load volatile i16, i16 addrspace(5)* %in
  %load1 = load volatile i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; The high half is loaded from %in while %reg supplies the low half and is
; also stored (volatile) back through %in afterwards. The label line anchors
; the checks below to this function's own output, so they cannot match
; leftover text from the preceding function.
; FIXME: This test should work without copying of v0.
;        ds_read_u16_d16_hi preserves low 16 bits of the destination
;        and ds_write_b16 only reads low 16 bits.
; GCN-LABEL: {{^}}load_local_hi_v2i16_store_local_lo:
; GCN: s_waitcnt
; GFX900: v_mov_b32_e32 [[COPY:v[0-9]+]], v0
; GFX900-NEXT: ds_read_u16_d16_hi [[COPY]], v1
; GFX900-NEXT: ds_write_b16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, [[COPY]]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store volatile i16 %reg, i16 addrspace(3)* %in
  ret <2 x i16> %build1
}

attributes #0 = { nounwind }