; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]

; At most 2 digits. Make sure src_shared_base is not counted as a high
; number SGPR.

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}
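
; The checks above amount to the following sketch of the group-to-flat
; lowering (illustrative pseudocode, not actual output; "aperture_hi" is
; a made-up name for the shared-segment base):
;   hi   = (ptr != -1) ? aperture_hi : 0   ; -1 is the group null pointer
;   lo   = (ptr != -1) ? ptr : 0
;   flat = (hi << 32) | lo
; On CI the aperture is loaded through the queue pointer, which is why
; enable_sgpr_queue_ptr is 1 there; GFX9 reads it from
; HW_REG_SH_MEM_BASES instead, so the queue pointer stays disabled.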

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]

; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base

; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
  %ld = load volatile i32, i32* %stof
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}
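
; The reverse direction checked above reduces to a truncation plus a
; null substitution (again a sketch, not actual output):
;   group = (flat != 0) ? lo32(flat) : -1
; No aperture is involved, so neither the dispatch pointer nor the queue
; pointer needs to be enabled.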

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
  store volatile i32 0, i32 addrspace(5)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
  load volatile i32, i32 addrspace(4)* %ftos
  ret void
}

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}
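
; In the cast_0_* and cast_neg1_* tests, the operand is a compile-time
; constant, so the compare/select pair folds away. Note the asymmetry the
; checks suggest: group 0 is a valid pointer here (the group null is -1),
; so its flat equivalent still needs the shared aperture in the high
; word, while casting the group null (-1) just materializes the flat
; null as the constant pair 0:0.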

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.

; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: {{flat|global}}_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %local, label %global

local:
  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32*
  br label %end

global:
  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32*
  br label %end

end:
  %fptr = phi i32* [ %flat_local, %local ], [ %flat_global, %global ]
  store volatile i32 %x, i32* %fptr, align 4
; %val = load i32, i32* %fptr, align 4
; store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Check for prologue initializing special SGPRs pointing to scratch.
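; A rough reading of the prologue checks below (an interpretation, not a
; spec; the authoritative sequence lives in the backend's frame lowering):
;   CI:    flat_scratch_lo = s9
;          flat_scratch_hi = (s8 + s11) >> 8   ; stored in 256-byte units
;   GFX9:  {flat_scratch_hi, flat_scratch_lo} = {s7, s6} + zext(s9)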
; HSA-LABEL: {{^}}store_flat_scratch:
; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8

; GFX9: s_add_u32 flat_scratch_lo, s6, s9
; GFX9: s_addc_u32 flat_scratch_hi, s7, 0

; HSA: {{flat|global}}_store_dword
; HSA: s_barrier
; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4, addrspace(5)
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32 addrspace(5)* %alloca, i32 %x
  %fptr = addrspacecast i32 addrspace(5)* %pptr to i32*
  store volatile i32 %x, i32* %fptr
  ; Dummy call
  call void @llvm.amdgcn.s.barrier() #1
  %reload = load volatile i32, i32* %fptr, align 4
  store volatile i32 %reload, i32 addrspace(1)* %out, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind readnone }