1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s 3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s 4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s 5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s 6 7; FIXME: Merge with other test. DS offset folding doesn't work due to 8; register bank copies, and no return optimization is missing. 9 10 11declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2 12declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 13declare i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2 14 15declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2 16declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2 17declare i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2 18 19declare i32 @llvm.amdgcn.workitem.id.x() #1 20 21define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { 22; CI-LABEL: lds_atomic_inc_ret_i32: 23; CI: ; %bb.0: 24; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 25; CI-NEXT: s_load_dword s2, s[4:5], 0x2 26; CI-NEXT: v_mov_b32_e32 v0, 42 27; CI-NEXT: s_mov_b32 m0, -1 28; CI-NEXT: s_waitcnt lgkmcnt(0) 29; CI-NEXT: v_mov_b32_e32 v1, s2 30; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 31; CI-NEXT: v_mov_b32_e32 v0, s0 32; CI-NEXT: v_mov_b32_e32 v1, s1 33; CI-NEXT: s_waitcnt lgkmcnt(0) 34; 
CI-NEXT: flat_store_dword v[0:1], v2 35; CI-NEXT: s_endpgm 36; 37; VI-LABEL: lds_atomic_inc_ret_i32: 38; VI: ; %bb.0: 39; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 40; VI-NEXT: s_load_dword s2, s[4:5], 0x8 41; VI-NEXT: v_mov_b32_e32 v0, 42 42; VI-NEXT: s_mov_b32 m0, -1 43; VI-NEXT: s_waitcnt lgkmcnt(0) 44; VI-NEXT: v_mov_b32_e32 v1, s2 45; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 46; VI-NEXT: v_mov_b32_e32 v0, s0 47; VI-NEXT: v_mov_b32_e32 v1, s1 48; VI-NEXT: s_waitcnt lgkmcnt(0) 49; VI-NEXT: flat_store_dword v[0:1], v2 50; VI-NEXT: s_endpgm 51; 52; GFX9-LABEL: lds_atomic_inc_ret_i32: 53; GFX9: ; %bb.0: 54; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 55; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 56; GFX9-NEXT: v_mov_b32_e32 v1, 42 57; GFX9-NEXT: s_waitcnt lgkmcnt(0) 58; GFX9-NEXT: v_mov_b32_e32 v0, s2 59; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 60; GFX9-NEXT: v_mov_b32_e32 v1, 0 61; GFX9-NEXT: s_waitcnt lgkmcnt(0) 62; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 63; GFX9-NEXT: s_endpgm 64; 65; GFX10-LABEL: lds_atomic_inc_ret_i32: 66; GFX10: ; %bb.0: 67; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 68; GFX10-NEXT: v_mov_b32_e32 v1, 42 69; GFX10-NEXT: s_waitcnt lgkmcnt(0) 70; GFX10-NEXT: v_mov_b32_e32 v0, s0 71; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 72; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 73; GFX10-NEXT: v_mov_b32_e32 v1, 0 74; GFX10-NEXT: s_waitcnt lgkmcnt(0) 75; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 76; GFX10-NEXT: s_endpgm 77 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0 78 store i32 %result, i32 addrspace(1)* %out 79 ret void 80} 81 82!0 = !{!1} 83!1 = distinct !{!1, !2} 84!2 = distinct !{!2} 85 86define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { 87; CI-LABEL: lds_atomic_inc_ret_i32_offset: 88; CI: ; %bb.0: 89; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 90; CI-NEXT: s_load_dword s2, s[4:5], 0x2 91; CI-NEXT: v_mov_b32_e32 
v0, 42 92; CI-NEXT: s_mov_b32 m0, -1 93; CI-NEXT: s_waitcnt lgkmcnt(0) 94; CI-NEXT: v_mov_b32_e32 v1, s2 95; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 96; CI-NEXT: v_mov_b32_e32 v0, s0 97; CI-NEXT: v_mov_b32_e32 v1, s1 98; CI-NEXT: s_waitcnt lgkmcnt(0) 99; CI-NEXT: flat_store_dword v[0:1], v2 100; CI-NEXT: s_endpgm 101; 102; VI-LABEL: lds_atomic_inc_ret_i32_offset: 103; VI: ; %bb.0: 104; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 105; VI-NEXT: s_load_dword s2, s[4:5], 0x8 106; VI-NEXT: v_mov_b32_e32 v0, 42 107; VI-NEXT: s_mov_b32 m0, -1 108; VI-NEXT: s_waitcnt lgkmcnt(0) 109; VI-NEXT: v_mov_b32_e32 v1, s2 110; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 111; VI-NEXT: v_mov_b32_e32 v0, s0 112; VI-NEXT: v_mov_b32_e32 v1, s1 113; VI-NEXT: s_waitcnt lgkmcnt(0) 114; VI-NEXT: flat_store_dword v[0:1], v2 115; VI-NEXT: s_endpgm 116; 117; GFX9-LABEL: lds_atomic_inc_ret_i32_offset: 118; GFX9: ; %bb.0: 119; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 120; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 121; GFX9-NEXT: v_mov_b32_e32 v0, 42 122; GFX9-NEXT: s_waitcnt lgkmcnt(0) 123; GFX9-NEXT: v_mov_b32_e32 v1, s2 124; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 125; GFX9-NEXT: v_mov_b32_e32 v1, 0 126; GFX9-NEXT: s_waitcnt lgkmcnt(0) 127; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 128; GFX9-NEXT: s_endpgm 129; 130; GFX10-LABEL: lds_atomic_inc_ret_i32_offset: 131; GFX10: ; %bb.0: 132; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 133; GFX10-NEXT: v_mov_b32_e32 v0, 42 134; GFX10-NEXT: s_waitcnt lgkmcnt(0) 135; GFX10-NEXT: v_mov_b32_e32 v1, s0 136; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 137; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 138; GFX10-NEXT: v_mov_b32_e32 v1, 0 139; GFX10-NEXT: s_waitcnt lgkmcnt(0) 140; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 141; GFX10-NEXT: s_endpgm 142 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 143 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) 144 store i32 
%result, i32 addrspace(1)* %out 145 ret void 146} 147 148define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { 149; CI-LABEL: lds_atomic_inc_noret_i32: 150; CI: ; %bb.0: 151; CI-NEXT: s_load_dword s0, s[4:5], 0x0 152; CI-NEXT: v_mov_b32_e32 v0, 42 153; CI-NEXT: s_mov_b32 m0, -1 154; CI-NEXT: s_waitcnt lgkmcnt(0) 155; CI-NEXT: v_mov_b32_e32 v1, s0 156; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 157; CI-NEXT: s_endpgm 158; 159; VI-LABEL: lds_atomic_inc_noret_i32: 160; VI: ; %bb.0: 161; VI-NEXT: s_load_dword s0, s[4:5], 0x0 162; VI-NEXT: v_mov_b32_e32 v0, 42 163; VI-NEXT: s_mov_b32 m0, -1 164; VI-NEXT: s_waitcnt lgkmcnt(0) 165; VI-NEXT: v_mov_b32_e32 v1, s0 166; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 167; VI-NEXT: s_endpgm 168; 169; GFX9-LABEL: lds_atomic_inc_noret_i32: 170; GFX9: ; %bb.0: 171; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 172; GFX9-NEXT: v_mov_b32_e32 v1, 42 173; GFX9-NEXT: s_waitcnt lgkmcnt(0) 174; GFX9-NEXT: v_mov_b32_e32 v0, s0 175; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 176; GFX9-NEXT: s_endpgm 177; 178; GFX10-LABEL: lds_atomic_inc_noret_i32: 179; GFX10: ; %bb.0: 180; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 181; GFX10-NEXT: v_mov_b32_e32 v1, 42 182; GFX10-NEXT: s_waitcnt lgkmcnt(0) 183; GFX10-NEXT: v_mov_b32_e32 v0, s0 184; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 185; GFX10-NEXT: s_endpgm 186 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) 187 ret void 188} 189 190define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { 191; CI-LABEL: lds_atomic_inc_noret_i32_offset: 192; CI: ; %bb.0: 193; CI-NEXT: s_load_dword s0, s[4:5], 0x0 194; CI-NEXT: v_mov_b32_e32 v0, 42 195; CI-NEXT: s_mov_b32 m0, -1 196; CI-NEXT: s_waitcnt lgkmcnt(0) 197; CI-NEXT: v_mov_b32_e32 v1, s0 198; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 199; CI-NEXT: s_endpgm 200; 201; VI-LABEL: lds_atomic_inc_noret_i32_offset: 202; VI: ; %bb.0: 203; VI-NEXT: s_load_dword s0, 
s[4:5], 0x0 204; VI-NEXT: v_mov_b32_e32 v0, 42 205; VI-NEXT: s_mov_b32 m0, -1 206; VI-NEXT: s_waitcnt lgkmcnt(0) 207; VI-NEXT: v_mov_b32_e32 v1, s0 208; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 209; VI-NEXT: s_endpgm 210; 211; GFX9-LABEL: lds_atomic_inc_noret_i32_offset: 212; GFX9: ; %bb.0: 213; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 214; GFX9-NEXT: v_mov_b32_e32 v0, 42 215; GFX9-NEXT: s_waitcnt lgkmcnt(0) 216; GFX9-NEXT: v_mov_b32_e32 v1, s0 217; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 218; GFX9-NEXT: s_endpgm 219; 220; GFX10-LABEL: lds_atomic_inc_noret_i32_offset: 221; GFX10: ; %bb.0: 222; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 223; GFX10-NEXT: v_mov_b32_e32 v0, 42 224; GFX10-NEXT: s_waitcnt lgkmcnt(0) 225; GFX10-NEXT: v_mov_b32_e32 v1, s0 226; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 227; GFX10-NEXT: s_endpgm 228 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 229 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) 230 ret void 231} 232 233define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { 234; CI-LABEL: global_atomic_inc_ret_i32: 235; CI: ; %bb.0: 236; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 237; CI-NEXT: v_mov_b32_e32 v2, 42 238; CI-NEXT: s_waitcnt lgkmcnt(0) 239; CI-NEXT: v_mov_b32_e32 v0, s2 240; CI-NEXT: v_mov_b32_e32 v1, s3 241; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 242; CI-NEXT: v_mov_b32_e32 v0, s0 243; CI-NEXT: v_mov_b32_e32 v1, s1 244; CI-NEXT: s_waitcnt vmcnt(0) 245; CI-NEXT: flat_store_dword v[0:1], v2 246; CI-NEXT: s_endpgm 247; 248; VI-LABEL: global_atomic_inc_ret_i32: 249; VI: ; %bb.0: 250; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 251; VI-NEXT: v_mov_b32_e32 v2, 42 252; VI-NEXT: s_waitcnt lgkmcnt(0) 253; VI-NEXT: v_mov_b32_e32 v0, s2 254; VI-NEXT: v_mov_b32_e32 v1, s3 255; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 256; VI-NEXT: v_mov_b32_e32 v0, s0 257; VI-NEXT: v_mov_b32_e32 v1, s1 258; 
VI-NEXT: s_waitcnt vmcnt(0) 259; VI-NEXT: flat_store_dword v[0:1], v2 260; VI-NEXT: s_endpgm 261; 262; GFX9-LABEL: global_atomic_inc_ret_i32: 263; GFX9: ; %bb.0: 264; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 265; GFX9-NEXT: v_mov_b32_e32 v0, 42 266; GFX9-NEXT: v_mov_b32_e32 v1, 0 267; GFX9-NEXT: s_waitcnt lgkmcnt(0) 268; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc 269; GFX9-NEXT: s_waitcnt vmcnt(0) 270; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 271; GFX9-NEXT: s_endpgm 272; 273; GFX10-LABEL: global_atomic_inc_ret_i32: 274; GFX10: ; %bb.0: 275; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 276; GFX10-NEXT: v_mov_b32_e32 v0, 42 277; GFX10-NEXT: v_mov_b32_e32 v1, 0 278; GFX10-NEXT: s_waitcnt lgkmcnt(0) 279; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc 280; GFX10-NEXT: s_waitcnt vmcnt(0) 281; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 282; GFX10-NEXT: s_endpgm 283 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) 284 store i32 %result, i32 addrspace(1)* %out 285 ret void 286} 287 288define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { 289; CI-LABEL: global_atomic_inc_ret_i32_offset: 290; CI: ; %bb.0: 291; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 292; CI-NEXT: v_mov_b32_e32 v2, 42 293; CI-NEXT: s_waitcnt lgkmcnt(0) 294; CI-NEXT: s_add_u32 s2, s2, 16 295; CI-NEXT: s_addc_u32 s3, s3, 0 296; CI-NEXT: v_mov_b32_e32 v0, s2 297; CI-NEXT: v_mov_b32_e32 v1, s3 298; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 299; CI-NEXT: v_mov_b32_e32 v0, s0 300; CI-NEXT: v_mov_b32_e32 v1, s1 301; CI-NEXT: s_waitcnt vmcnt(0) 302; CI-NEXT: flat_store_dword v[0:1], v2 303; CI-NEXT: s_endpgm 304; 305; VI-LABEL: global_atomic_inc_ret_i32_offset: 306; VI: ; %bb.0: 307; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 308; VI-NEXT: v_mov_b32_e32 v2, 42 309; VI-NEXT: s_waitcnt lgkmcnt(0) 310; VI-NEXT: s_add_u32 s2, s2, 16 311; VI-NEXT: s_addc_u32 s3, s3, 0 
312; VI-NEXT: v_mov_b32_e32 v0, s2 313; VI-NEXT: v_mov_b32_e32 v1, s3 314; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 315; VI-NEXT: v_mov_b32_e32 v0, s0 316; VI-NEXT: v_mov_b32_e32 v1, s1 317; VI-NEXT: s_waitcnt vmcnt(0) 318; VI-NEXT: flat_store_dword v[0:1], v2 319; VI-NEXT: s_endpgm 320; 321; GFX9-LABEL: global_atomic_inc_ret_i32_offset: 322; GFX9: ; %bb.0: 323; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 324; GFX9-NEXT: v_mov_b32_e32 v0, 42 325; GFX9-NEXT: v_mov_b32_e32 v1, 0 326; GFX9-NEXT: s_waitcnt lgkmcnt(0) 327; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc 328; GFX9-NEXT: s_waitcnt vmcnt(0) 329; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 330; GFX9-NEXT: s_endpgm 331; 332; GFX10-LABEL: global_atomic_inc_ret_i32_offset: 333; GFX10: ; %bb.0: 334; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 335; GFX10-NEXT: v_mov_b32_e32 v0, 42 336; GFX10-NEXT: v_mov_b32_e32 v1, 0 337; GFX10-NEXT: s_waitcnt lgkmcnt(0) 338; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc 339; GFX10-NEXT: s_waitcnt vmcnt(0) 340; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 341; GFX10-NEXT: s_endpgm 342 %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 343 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) 344 store i32 %result, i32 addrspace(1)* %out 345 ret void 346} 347 348define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { 349; CI-LABEL: global_atomic_inc_noret_i32: 350; CI: ; %bb.0: 351; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 352; CI-NEXT: v_mov_b32_e32 v2, 42 353; CI-NEXT: s_waitcnt lgkmcnt(0) 354; CI-NEXT: v_mov_b32_e32 v0, s0 355; CI-NEXT: v_mov_b32_e32 v1, s1 356; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 357; CI-NEXT: s_endpgm 358; 359; VI-LABEL: global_atomic_inc_noret_i32: 360; VI: ; %bb.0: 361; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 362; VI-NEXT: v_mov_b32_e32 v2, 42 363; VI-NEXT: s_waitcnt lgkmcnt(0) 364; VI-NEXT: v_mov_b32_e32 v0, 
s0 365; VI-NEXT: v_mov_b32_e32 v1, s1 366; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 367; VI-NEXT: s_endpgm 368; 369; GFX9-LABEL: global_atomic_inc_noret_i32: 370; GFX9: ; %bb.0: 371; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 372; GFX9-NEXT: v_mov_b32_e32 v0, 42 373; GFX9-NEXT: v_mov_b32_e32 v1, 0 374; GFX9-NEXT: s_waitcnt lgkmcnt(0) 375; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[0:1] glc 376; GFX9-NEXT: s_endpgm 377; 378; GFX10-LABEL: global_atomic_inc_noret_i32: 379; GFX10: ; %bb.0: 380; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 381; GFX10-NEXT: v_mov_b32_e32 v0, 42 382; GFX10-NEXT: v_mov_b32_e32 v1, 0 383; GFX10-NEXT: s_waitcnt lgkmcnt(0) 384; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[0:1] glc 385; GFX10-NEXT: s_endpgm 386 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) 387 ret void 388} 389 390define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { 391; CI-LABEL: global_atomic_inc_noret_i32_offset: 392; CI: ; %bb.0: 393; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 394; CI-NEXT: v_mov_b32_e32 v2, 42 395; CI-NEXT: s_waitcnt lgkmcnt(0) 396; CI-NEXT: s_add_u32 s0, s0, 16 397; CI-NEXT: s_addc_u32 s1, s1, 0 398; CI-NEXT: v_mov_b32_e32 v0, s0 399; CI-NEXT: v_mov_b32_e32 v1, s1 400; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 401; CI-NEXT: s_endpgm 402; 403; VI-LABEL: global_atomic_inc_noret_i32_offset: 404; VI: ; %bb.0: 405; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 406; VI-NEXT: v_mov_b32_e32 v2, 42 407; VI-NEXT: s_waitcnt lgkmcnt(0) 408; VI-NEXT: s_add_u32 s0, s0, 16 409; VI-NEXT: s_addc_u32 s1, s1, 0 410; VI-NEXT: v_mov_b32_e32 v0, s0 411; VI-NEXT: v_mov_b32_e32 v1, s1 412; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 413; VI-NEXT: s_endpgm 414; 415; GFX9-LABEL: global_atomic_inc_noret_i32_offset: 416; GFX9: ; %bb.0: 417; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 418; GFX9-NEXT: v_mov_b32_e32 v0, 42 419; GFX9-NEXT: v_mov_b32_e32 v1, 0 420; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 421; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc 422; GFX9-NEXT: s_endpgm 423; 424; GFX10-LABEL: global_atomic_inc_noret_i32_offset: 425; GFX10: ; %bb.0: 426; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 427; GFX10-NEXT: v_mov_b32_e32 v0, 42 428; GFX10-NEXT: v_mov_b32_e32 v1, 0 429; GFX10-NEXT: s_waitcnt lgkmcnt(0) 430; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc 431; GFX10-NEXT: s_endpgm 432 %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 433 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) 434 ret void 435} 436 437define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { 438; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: 439; CI: ; %bb.0: 440; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 441; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 442; CI-NEXT: s_waitcnt lgkmcnt(0) 443; CI-NEXT: v_mov_b32_e32 v0, s2 444; CI-NEXT: v_mov_b32_e32 v1, s3 445; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 446; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 447; CI-NEXT: v_mov_b32_e32 v0, s0 448; CI-NEXT: v_mov_b32_e32 v1, s1 449; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 450; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 451; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 452; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 453; CI-NEXT: v_mov_b32_e32 v4, 42 454; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc 455; CI-NEXT: s_waitcnt vmcnt(0) 456; CI-NEXT: flat_store_dword v[0:1], v2 457; CI-NEXT: s_endpgm 458; 459; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: 460; VI: ; %bb.0: 461; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 462; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 463; VI-NEXT: s_waitcnt lgkmcnt(0) 464; VI-NEXT: v_mov_b32_e32 v0, s2 465; VI-NEXT: v_mov_b32_e32 v1, s3 466; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 467; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 468; VI-NEXT: v_mov_b32_e32 v0, s0 469; VI-NEXT: 
v_mov_b32_e32 v1, s1 470; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 471; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 472; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 473; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 474; VI-NEXT: v_mov_b32_e32 v4, 42 475; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc 476; VI-NEXT: s_waitcnt vmcnt(0) 477; VI-NEXT: flat_store_dword v[0:1], v2 478; VI-NEXT: s_endpgm 479; 480; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: 481; GFX9: ; %bb.0: 482; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 483; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 484; GFX9-NEXT: v_mov_b32_e32 v1, 42 485; GFX9-NEXT: s_waitcnt lgkmcnt(0) 486; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc 487; GFX9-NEXT: s_waitcnt vmcnt(0) 488; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 489; GFX9-NEXT: s_endpgm 490; 491; GFX10-LABEL: global_atomic_inc_ret_i32_offset_addr64: 492; GFX10: ; %bb.0: 493; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 494; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 495; GFX10-NEXT: v_mov_b32_e32 v1, 42 496; GFX10-NEXT: s_waitcnt lgkmcnt(0) 497; GFX10-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc 498; GFX10-NEXT: s_waitcnt vmcnt(0) 499; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 500; GFX10-NEXT: s_endpgm 501 %id = call i32 @llvm.amdgcn.workitem.id.x() 502 %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id 503 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id 504 %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 505 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) 506 store i32 %result, i32 addrspace(1)* %out.gep 507 ret void 508} 509 510define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { 511; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: 512; CI: ; %bb.0: 513; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 514; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 515; CI-NEXT: s_waitcnt lgkmcnt(0) 516; 
CI-NEXT: v_mov_b32_e32 v0, s0 517; CI-NEXT: v_mov_b32_e32 v1, s1 518; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 519; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 520; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 521; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 522; CI-NEXT: v_mov_b32_e32 v2, 42 523; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 524; CI-NEXT: s_endpgm 525; 526; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: 527; VI: ; %bb.0: 528; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 529; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 530; VI-NEXT: s_waitcnt lgkmcnt(0) 531; VI-NEXT: v_mov_b32_e32 v0, s0 532; VI-NEXT: v_mov_b32_e32 v1, s1 533; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 534; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 535; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 536; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 537; VI-NEXT: v_mov_b32_e32 v2, 42 538; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 539; VI-NEXT: s_endpgm 540; 541; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: 542; GFX9: ; %bb.0: 543; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 544; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 545; GFX9-NEXT: v_mov_b32_e32 v1, 42 546; GFX9-NEXT: s_waitcnt lgkmcnt(0) 547; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc 548; GFX9-NEXT: s_endpgm 549; 550; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64: 551; GFX10: ; %bb.0: 552; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 553; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 554; GFX10-NEXT: v_mov_b32_e32 v1, 42 555; GFX10-NEXT: s_waitcnt lgkmcnt(0) 556; GFX10-NEXT: global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc 557; GFX10-NEXT: s_endpgm 558 %id = call i32 @llvm.amdgcn.workitem.id.x() 559 %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id 560 %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 561 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) 562 ret void 563} 564 565@lds0 = internal addrspace(3) global [512 x i32] undef, 
align 4 566 567define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { 568; CI-LABEL: atomic_inc_shl_base_lds_0_i32: 569; CI: ; %bb.0: 570; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 571; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v0 572; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 573; CI-NEXT: v_mov_b32_e32 v1, 9 574; CI-NEXT: s_mov_b32 m0, -1 575; CI-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8 576; CI-NEXT: s_waitcnt lgkmcnt(0) 577; CI-NEXT: v_mov_b32_e32 v0, s2 578; CI-NEXT: v_mov_b32_e32 v1, s3 579; CI-NEXT: flat_store_dword v[0:1], v2 580; CI-NEXT: v_mov_b32_e32 v0, s0 581; CI-NEXT: v_mov_b32_e32 v1, s1 582; CI-NEXT: flat_store_dword v[0:1], v3 583; CI-NEXT: s_endpgm 584; 585; VI-LABEL: atomic_inc_shl_base_lds_0_i32: 586; VI: ; %bb.0: 587; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 588; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 589; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 590; VI-NEXT: v_mov_b32_e32 v1, 9 591; VI-NEXT: s_mov_b32 m0, -1 592; VI-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8 593; VI-NEXT: s_waitcnt lgkmcnt(0) 594; VI-NEXT: v_mov_b32_e32 v0, s2 595; VI-NEXT: v_mov_b32_e32 v1, s3 596; VI-NEXT: flat_store_dword v[0:1], v2 597; VI-NEXT: v_mov_b32_e32 v0, s0 598; VI-NEXT: v_mov_b32_e32 v1, s1 599; VI-NEXT: flat_store_dword v[0:1], v3 600; VI-NEXT: s_endpgm 601; 602; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: 603; GFX9: ; %bb.0: 604; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 605; GFX9-NEXT: v_add_u32_e32 v1, 2, v0 606; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 607; GFX9-NEXT: v_mov_b32_e32 v2, 9 608; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v2 offset:8 609; GFX9-NEXT: v_mov_b32_e32 v2, 0 610; GFX9-NEXT: s_waitcnt lgkmcnt(0) 611; GFX9-NEXT: global_store_dword v2, v1, s[2:3] 612; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 613; GFX9-NEXT: s_endpgm 614; 615; GFX10-LABEL: atomic_inc_shl_base_lds_0_i32: 616; GFX10: ; %bb.0: 617; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 618; GFX10-NEXT: v_mov_b32_e32 v2, 9 619; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 620; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 621; GFX10-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 622; GFX10-NEXT: v_mov_b32_e32 v2, 0 623; GFX10-NEXT: s_waitcnt lgkmcnt(0) 624; GFX10-NEXT: global_store_dword v2, v0, s[2:3] 625; GFX10-NEXT: global_store_dword v2, v1, s[0:1] 626; GFX10-NEXT: s_endpgm 627 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 628 %idx.0 = add nsw i32 %tid.x, 2 629 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 630 %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false) 631 store i32 %idx.0, i32 addrspace(1)* %add_use 632 store i32 %val0, i32 addrspace(1)* %out 633 ret void 634} 635 636define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { 637; CI-LABEL: lds_atomic_inc_ret_i64: 638; CI: ; %bb.0: 639; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 640; CI-NEXT: s_load_dword s2, s[4:5], 0x2 641; CI-NEXT: v_mov_b32_e32 v0, 42 642; CI-NEXT: v_mov_b32_e32 v1, 0 643; CI-NEXT: s_mov_b32 m0, -1 644; CI-NEXT: s_waitcnt lgkmcnt(0) 645; CI-NEXT: v_mov_b32_e32 v2, s2 646; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] 647; CI-NEXT: v_mov_b32_e32 v3, s1 648; CI-NEXT: v_mov_b32_e32 v2, s0 649; CI-NEXT: s_waitcnt lgkmcnt(0) 650; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 651; CI-NEXT: s_endpgm 652; 653; VI-LABEL: lds_atomic_inc_ret_i64: 654; VI: ; %bb.0: 655; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 656; VI-NEXT: s_load_dword s2, s[4:5], 0x8 657; VI-NEXT: v_mov_b32_e32 v0, 42 658; VI-NEXT: v_mov_b32_e32 v1, 0 659; VI-NEXT: s_mov_b32 m0, -1 660; VI-NEXT: s_waitcnt lgkmcnt(0) 661; VI-NEXT: v_mov_b32_e32 v2, s2 662; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] 663; VI-NEXT: v_mov_b32_e32 v3, s1 664; VI-NEXT: v_mov_b32_e32 v2, s0 665; VI-NEXT: s_waitcnt lgkmcnt(0) 666; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 667; VI-NEXT: s_endpgm 668; 669; GFX9-LABEL: 
lds_atomic_inc_ret_i64: 670; GFX9: ; %bb.0: 671; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 672; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 673; GFX9-NEXT: v_mov_b32_e32 v0, 42 674; GFX9-NEXT: v_mov_b32_e32 v1, 0 675; GFX9-NEXT: s_waitcnt lgkmcnt(0) 676; GFX9-NEXT: v_mov_b32_e32 v2, s2 677; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] 678; GFX9-NEXT: v_mov_b32_e32 v2, 0 679; GFX9-NEXT: s_waitcnt lgkmcnt(0) 680; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 681; GFX9-NEXT: s_endpgm 682; 683; GFX10-LABEL: lds_atomic_inc_ret_i64: 684; GFX10: ; %bb.0: 685; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 686; GFX10-NEXT: v_mov_b32_e32 v0, 42 687; GFX10-NEXT: v_mov_b32_e32 v1, 0 688; GFX10-NEXT: s_waitcnt lgkmcnt(0) 689; GFX10-NEXT: v_mov_b32_e32 v2, s0 690; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 691; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] 692; GFX10-NEXT: v_mov_b32_e32 v2, 0 693; GFX10-NEXT: s_waitcnt lgkmcnt(0) 694; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 695; GFX10-NEXT: s_endpgm 696 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) 697 store i64 %result, i64 addrspace(1)* %out 698 ret void 699} 700 701define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { 702; CI-LABEL: lds_atomic_inc_ret_i64_offset: 703; CI: ; %bb.0: 704; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 705; CI-NEXT: s_load_dword s2, s[4:5], 0x2 706; CI-NEXT: v_mov_b32_e32 v0, 42 707; CI-NEXT: v_mov_b32_e32 v1, 0 708; CI-NEXT: s_mov_b32 m0, -1 709; CI-NEXT: s_waitcnt lgkmcnt(0) 710; CI-NEXT: v_mov_b32_e32 v2, s2 711; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 712; CI-NEXT: v_mov_b32_e32 v3, s1 713; CI-NEXT: v_mov_b32_e32 v2, s0 714; CI-NEXT: s_waitcnt lgkmcnt(0) 715; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 716; CI-NEXT: s_endpgm 717; 718; VI-LABEL: lds_atomic_inc_ret_i64_offset: 719; VI: ; %bb.0: 720; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 721; 
VI-NEXT: s_load_dword s2, s[4:5], 0x8 722; VI-NEXT: v_mov_b32_e32 v0, 42 723; VI-NEXT: v_mov_b32_e32 v1, 0 724; VI-NEXT: s_mov_b32 m0, -1 725; VI-NEXT: s_waitcnt lgkmcnt(0) 726; VI-NEXT: v_mov_b32_e32 v2, s2 727; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 728; VI-NEXT: v_mov_b32_e32 v3, s1 729; VI-NEXT: v_mov_b32_e32 v2, s0 730; VI-NEXT: s_waitcnt lgkmcnt(0) 731; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 732; VI-NEXT: s_endpgm 733; 734; GFX9-LABEL: lds_atomic_inc_ret_i64_offset: 735; GFX9: ; %bb.0: 736; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 737; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 738; GFX9-NEXT: v_mov_b32_e32 v0, 42 739; GFX9-NEXT: v_mov_b32_e32 v1, 0 740; GFX9-NEXT: s_waitcnt lgkmcnt(0) 741; GFX9-NEXT: v_mov_b32_e32 v2, s2 742; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 743; GFX9-NEXT: v_mov_b32_e32 v2, 0 744; GFX9-NEXT: s_waitcnt lgkmcnt(0) 745; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 746; GFX9-NEXT: s_endpgm 747; 748; GFX10-LABEL: lds_atomic_inc_ret_i64_offset: 749; GFX10: ; %bb.0: 750; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 751; GFX10-NEXT: v_mov_b32_e32 v0, 42 752; GFX10-NEXT: v_mov_b32_e32 v1, 0 753; GFX10-NEXT: s_waitcnt lgkmcnt(0) 754; GFX10-NEXT: v_mov_b32_e32 v2, s0 755; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 756; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 757; GFX10-NEXT: v_mov_b32_e32 v2, 0 758; GFX10-NEXT: s_waitcnt lgkmcnt(0) 759; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 760; GFX10-NEXT: s_endpgm 761 %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 762 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) 763 store i64 %result, i64 addrspace(1)* %out 764 ret void 765} 766 767define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { 768; CI-LABEL: lds_atomic_inc_noret_i64: 769; CI: ; %bb.0: 770; CI-NEXT: s_load_dword s0, s[4:5], 0x0 771; CI-NEXT: v_mov_b32_e32 v0, 42 772; CI-NEXT: 
v_mov_b32_e32 v1, 0 773; CI-NEXT: s_mov_b32 m0, -1 774; CI-NEXT: s_waitcnt lgkmcnt(0) 775; CI-NEXT: v_mov_b32_e32 v2, s0 776; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] 777; CI-NEXT: s_endpgm 778; 779; VI-LABEL: lds_atomic_inc_noret_i64: 780; VI: ; %bb.0: 781; VI-NEXT: s_load_dword s0, s[4:5], 0x0 782; VI-NEXT: v_mov_b32_e32 v0, 42 783; VI-NEXT: v_mov_b32_e32 v1, 0 784; VI-NEXT: s_mov_b32 m0, -1 785; VI-NEXT: s_waitcnt lgkmcnt(0) 786; VI-NEXT: v_mov_b32_e32 v2, s0 787; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] 788; VI-NEXT: s_endpgm 789; 790; GFX9-LABEL: lds_atomic_inc_noret_i64: 791; GFX9: ; %bb.0: 792; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 793; GFX9-NEXT: v_mov_b32_e32 v0, 42 794; GFX9-NEXT: v_mov_b32_e32 v1, 0 795; GFX9-NEXT: s_waitcnt lgkmcnt(0) 796; GFX9-NEXT: v_mov_b32_e32 v2, s0 797; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] 798; GFX9-NEXT: s_endpgm 799; 800; GFX10-LABEL: lds_atomic_inc_noret_i64: 801; GFX10: ; %bb.0: 802; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 803; GFX10-NEXT: v_mov_b32_e32 v0, 42 804; GFX10-NEXT: v_mov_b32_e32 v1, 0 805; GFX10-NEXT: s_waitcnt lgkmcnt(0) 806; GFX10-NEXT: v_mov_b32_e32 v2, s0 807; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] 808; GFX10-NEXT: s_endpgm 809 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) 810 ret void 811} 812 813define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { 814; CI-LABEL: lds_atomic_inc_noret_i64_offset: 815; CI: ; %bb.0: 816; CI-NEXT: s_load_dword s0, s[4:5], 0x0 817; CI-NEXT: v_mov_b32_e32 v0, 42 818; CI-NEXT: v_mov_b32_e32 v1, 0 819; CI-NEXT: s_mov_b32 m0, -1 820; CI-NEXT: s_waitcnt lgkmcnt(0) 821; CI-NEXT: v_mov_b32_e32 v2, s0 822; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 823; CI-NEXT: s_endpgm 824; 825; VI-LABEL: lds_atomic_inc_noret_i64_offset: 826; VI: ; %bb.0: 827; VI-NEXT: s_load_dword s0, s[4:5], 0x0 828; VI-NEXT: v_mov_b32_e32 v0, 42 829; VI-NEXT: v_mov_b32_e32 v1, 0 830; 
VI-NEXT: s_mov_b32 m0, -1 831; VI-NEXT: s_waitcnt lgkmcnt(0) 832; VI-NEXT: v_mov_b32_e32 v2, s0 833; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 834; VI-NEXT: s_endpgm 835; 836; GFX9-LABEL: lds_atomic_inc_noret_i64_offset: 837; GFX9: ; %bb.0: 838; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 839; GFX9-NEXT: v_mov_b32_e32 v0, 42 840; GFX9-NEXT: v_mov_b32_e32 v1, 0 841; GFX9-NEXT: s_waitcnt lgkmcnt(0) 842; GFX9-NEXT: v_mov_b32_e32 v2, s0 843; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 844; GFX9-NEXT: s_endpgm 845; 846; GFX10-LABEL: lds_atomic_inc_noret_i64_offset: 847; GFX10: ; %bb.0: 848; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 849; GFX10-NEXT: v_mov_b32_e32 v0, 42 850; GFX10-NEXT: v_mov_b32_e32 v1, 0 851; GFX10-NEXT: s_waitcnt lgkmcnt(0) 852; GFX10-NEXT: v_mov_b32_e32 v2, s0 853; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 854; GFX10-NEXT: s_endpgm 855 %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 856 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) 857 ret void 858} 859 860define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { 861; CI-LABEL: global_atomic_inc_ret_i64: 862; CI: ; %bb.0: 863; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 864; CI-NEXT: v_mov_b32_e32 v0, 42 865; CI-NEXT: v_mov_b32_e32 v1, 0 866; CI-NEXT: s_waitcnt lgkmcnt(0) 867; CI-NEXT: v_mov_b32_e32 v2, s2 868; CI-NEXT: v_mov_b32_e32 v3, s3 869; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 870; CI-NEXT: v_mov_b32_e32 v3, s1 871; CI-NEXT: v_mov_b32_e32 v2, s0 872; CI-NEXT: s_waitcnt vmcnt(0) 873; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 874; CI-NEXT: s_endpgm 875; 876; VI-LABEL: global_atomic_inc_ret_i64: 877; VI: ; %bb.0: 878; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 879; VI-NEXT: v_mov_b32_e32 v0, 42 880; VI-NEXT: v_mov_b32_e32 v1, 0 881; VI-NEXT: s_waitcnt lgkmcnt(0) 882; VI-NEXT: v_mov_b32_e32 v2, s2 883; VI-NEXT: v_mov_b32_e32 v3, s3 
884; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 885; VI-NEXT: v_mov_b32_e32 v3, s1 886; VI-NEXT: v_mov_b32_e32 v2, s0 887; VI-NEXT: s_waitcnt vmcnt(0) 888; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 889; VI-NEXT: s_endpgm 890; 891; GFX9-LABEL: global_atomic_inc_ret_i64: 892; GFX9: ; %bb.0: 893; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 894; GFX9-NEXT: v_mov_b32_e32 v0, 42 895; GFX9-NEXT: v_mov_b32_e32 v1, 0 896; GFX9-NEXT: v_mov_b32_e32 v2, 0 897; GFX9-NEXT: s_waitcnt lgkmcnt(0) 898; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc 899; GFX9-NEXT: s_waitcnt vmcnt(0) 900; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 901; GFX9-NEXT: s_endpgm 902; 903; GFX10-LABEL: global_atomic_inc_ret_i64: 904; GFX10: ; %bb.0: 905; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 906; GFX10-NEXT: v_mov_b32_e32 v0, 42 907; GFX10-NEXT: v_mov_b32_e32 v1, 0 908; GFX10-NEXT: v_mov_b32_e32 v2, 0 909; GFX10-NEXT: s_waitcnt lgkmcnt(0) 910; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc 911; GFX10-NEXT: s_waitcnt vmcnt(0) 912; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 913; GFX10-NEXT: s_endpgm 914 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) 915 store i64 %result, i64 addrspace(1)* %out 916 ret void 917} 918 ; Returning i64 global atomic inc through a constant GEP of 4 elements. The GFX9/GFX10 checks expect the immediate offset:32 on the atomic; the CI/VI checks expect s_add_u32/s_addc_u32 of 32 on the base address instead. 919define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { 920; CI-LABEL: global_atomic_inc_ret_i64_offset: 921; CI: ; %bb.0: 922; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 923; CI-NEXT: v_mov_b32_e32 v0, 42 924; CI-NEXT: v_mov_b32_e32 v1, 0 925; CI-NEXT: s_waitcnt lgkmcnt(0) 926; CI-NEXT: s_add_u32 s2, s2, 32 927; CI-NEXT: s_addc_u32 s3, s3, 0 928; CI-NEXT: v_mov_b32_e32 v2, s2 929; CI-NEXT: v_mov_b32_e32 v3, s3 930; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 931; CI-NEXT: v_mov_b32_e32 v3, s1 932; CI-NEXT: v_mov_b32_e32 v2, s0 933; CI-NEXT: s_waitcnt vmcnt(0) 934; CI-NEXT: flat_store_dwordx2
v[2:3], v[0:1] 935; CI-NEXT: s_endpgm 936; 937; VI-LABEL: global_atomic_inc_ret_i64_offset: 938; VI: ; %bb.0: 939; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 940; VI-NEXT: v_mov_b32_e32 v0, 42 941; VI-NEXT: v_mov_b32_e32 v1, 0 942; VI-NEXT: s_waitcnt lgkmcnt(0) 943; VI-NEXT: s_add_u32 s2, s2, 32 944; VI-NEXT: s_addc_u32 s3, s3, 0 945; VI-NEXT: v_mov_b32_e32 v2, s2 946; VI-NEXT: v_mov_b32_e32 v3, s3 947; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 948; VI-NEXT: v_mov_b32_e32 v3, s1 949; VI-NEXT: v_mov_b32_e32 v2, s0 950; VI-NEXT: s_waitcnt vmcnt(0) 951; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 952; VI-NEXT: s_endpgm 953; 954; GFX9-LABEL: global_atomic_inc_ret_i64_offset: 955; GFX9: ; %bb.0: 956; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 957; GFX9-NEXT: v_mov_b32_e32 v0, 42 958; GFX9-NEXT: v_mov_b32_e32 v1, 0 959; GFX9-NEXT: v_mov_b32_e32 v2, 0 960; GFX9-NEXT: s_waitcnt lgkmcnt(0) 961; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc 962; GFX9-NEXT: s_waitcnt vmcnt(0) 963; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 964; GFX9-NEXT: s_endpgm 965; 966; GFX10-LABEL: global_atomic_inc_ret_i64_offset: 967; GFX10: ; %bb.0: 968; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 969; GFX10-NEXT: v_mov_b32_e32 v0, 42 970; GFX10-NEXT: v_mov_b32_e32 v1, 0 971; GFX10-NEXT: v_mov_b32_e32 v2, 0 972; GFX10-NEXT: s_waitcnt lgkmcnt(0) 973; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc 974; GFX10-NEXT: s_waitcnt vmcnt(0) 975; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 976; GFX10-NEXT: s_endpgm 977 %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 978 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) 979 store i64 %result, i64 addrspace(1)* %out 980 ret void 981} 982 ; i64 global atomic inc whose result is unused. The checks still expect the glc form of the instruction on every target (see the FIXME at the top of the file about the missing no-return optimization). 983define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { 984; CI-LABEL: global_atomic_inc_noret_i64: 985; CI: ; %bb.0: 986; CI-NEXT:
s_load_dwordx2 s[0:1], s[4:5], 0x0 987; CI-NEXT: v_mov_b32_e32 v0, 42 988; CI-NEXT: v_mov_b32_e32 v1, 0 989; CI-NEXT: s_waitcnt lgkmcnt(0) 990; CI-NEXT: v_mov_b32_e32 v3, s1 991; CI-NEXT: v_mov_b32_e32 v2, s0 992; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 993; CI-NEXT: s_endpgm 994; 995; VI-LABEL: global_atomic_inc_noret_i64: 996; VI: ; %bb.0: 997; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 998; VI-NEXT: v_mov_b32_e32 v0, 42 999; VI-NEXT: v_mov_b32_e32 v1, 0 1000; VI-NEXT: s_waitcnt lgkmcnt(0) 1001; VI-NEXT: v_mov_b32_e32 v3, s1 1002; VI-NEXT: v_mov_b32_e32 v2, s0 1003; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1004; VI-NEXT: s_endpgm 1005; 1006; GFX9-LABEL: global_atomic_inc_noret_i64: 1007; GFX9: ; %bb.0: 1008; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1009; GFX9-NEXT: v_mov_b32_e32 v0, 42 1010; GFX9-NEXT: v_mov_b32_e32 v1, 0 1011; GFX9-NEXT: v_mov_b32_e32 v2, 0 1012; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1013; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc 1014; GFX9-NEXT: s_endpgm 1015; 1016; GFX10-LABEL: global_atomic_inc_noret_i64: 1017; GFX10: ; %bb.0: 1018; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1019; GFX10-NEXT: v_mov_b32_e32 v0, 42 1020; GFX10-NEXT: v_mov_b32_e32 v1, 0 1021; GFX10-NEXT: v_mov_b32_e32 v2, 0 1022; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1023; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc 1024; GFX10-NEXT: s_endpgm 1025 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) 1026 ret void 1027} 1028 ; Unused-result i64 global atomic inc through a constant GEP of 4 elements. The GFX9/GFX10 checks expect offset:32 on the atomic; the CI/VI checks expect a scalar add of 32 to the base address. glc is still expected on every target. 1029define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { 1030; CI-LABEL: global_atomic_inc_noret_i64_offset: 1031; CI: ; %bb.0: 1032; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1033; CI-NEXT: v_mov_b32_e32 v0, 42 1034; CI-NEXT: v_mov_b32_e32 v1, 0 1035; CI-NEXT: s_waitcnt lgkmcnt(0) 1036; CI-NEXT: s_add_u32 s0, s0, 32 1037; CI-NEXT: s_addc_u32 s1, s1, 0 1038; CI-NEXT: v_mov_b32_e32 v3, s1
1039; CI-NEXT: v_mov_b32_e32 v2, s0 1040; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1041; CI-NEXT: s_endpgm 1042; 1043; VI-LABEL: global_atomic_inc_noret_i64_offset: 1044; VI: ; %bb.0: 1045; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1046; VI-NEXT: v_mov_b32_e32 v0, 42 1047; VI-NEXT: v_mov_b32_e32 v1, 0 1048; VI-NEXT: s_waitcnt lgkmcnt(0) 1049; VI-NEXT: s_add_u32 s0, s0, 32 1050; VI-NEXT: s_addc_u32 s1, s1, 0 1051; VI-NEXT: v_mov_b32_e32 v3, s1 1052; VI-NEXT: v_mov_b32_e32 v2, s0 1053; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1054; VI-NEXT: s_endpgm 1055; 1056; GFX9-LABEL: global_atomic_inc_noret_i64_offset: 1057; GFX9: ; %bb.0: 1058; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1059; GFX9-NEXT: v_mov_b32_e32 v0, 42 1060; GFX9-NEXT: v_mov_b32_e32 v1, 0 1061; GFX9-NEXT: v_mov_b32_e32 v2, 0 1062; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1063; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc 1064; GFX9-NEXT: s_endpgm 1065; 1066; GFX10-LABEL: global_atomic_inc_noret_i64_offset: 1067; GFX10: ; %bb.0: 1068; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1069; GFX10-NEXT: v_mov_b32_e32 v0, 42 1070; GFX10-NEXT: v_mov_b32_e32 v1, 0 1071; GFX10-NEXT: v_mov_b32_e32 v2, 0 1072; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1073; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc 1074; GFX10-NEXT: s_endpgm 1075 %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 1076 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) 1077 ret void 1078} 1079 ; Returning i64 global atomic inc addressed by workitem.id.x plus a constant 5 elements; the result is stored at the same workitem index of %out. The GFX9/GFX10 checks expect the id shifted left by 3 as the VGPR offset with immediate offset:40; the CI/VI checks expect 64-bit VALU address adds, including an add of 40. 1080define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { 1081; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: 1082; CI: ; %bb.0: 1083; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1084; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1085; CI-NEXT: s_waitcnt lgkmcnt(0) 1086; CI-NEXT: v_mov_b32_e32 v0, s2 1087; CI-NEXT: v_mov_b32_e32 v1, s3 1088; CI-NEXT: v_add_i32_e32
v4, vcc, v0, v2 1089; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1090; CI-NEXT: v_mov_b32_e32 v0, s0 1091; CI-NEXT: v_mov_b32_e32 v1, s1 1092; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1093; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1094; CI-NEXT: v_mov_b32_e32 v2, 42 1095; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 1096; CI-NEXT: v_mov_b32_e32 v3, 0 1097; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1098; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc 1099; CI-NEXT: s_waitcnt vmcnt(0) 1100; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1101; CI-NEXT: s_endpgm 1102; 1103; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: 1104; VI: ; %bb.0: 1105; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1106; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1107; VI-NEXT: s_waitcnt lgkmcnt(0) 1108; VI-NEXT: v_mov_b32_e32 v0, s2 1109; VI-NEXT: v_mov_b32_e32 v1, s3 1110; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 1111; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1112; VI-NEXT: v_mov_b32_e32 v0, s0 1113; VI-NEXT: v_mov_b32_e32 v1, s1 1114; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1115; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1116; VI-NEXT: v_mov_b32_e32 v2, 42 1117; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 1118; VI-NEXT: v_mov_b32_e32 v3, 0 1119; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1120; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc 1121; VI-NEXT: s_waitcnt vmcnt(0) 1122; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1123; VI-NEXT: s_endpgm 1124; 1125; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: 1126; GFX9: ; %bb.0: 1127; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1128; GFX9-NEXT: v_mov_b32_e32 v1, 42 1129; GFX9-NEXT: v_mov_b32_e32 v2, 0 1130; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 1131; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1132; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc 1133; GFX9-NEXT: s_waitcnt vmcnt(0) 1134; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] 1135; GFX9-NEXT: s_endpgm 1136; 1137; GFX10-LABEL: 
global_atomic_inc_ret_i64_offset_addr64: 1138; GFX10: ; %bb.0: 1139; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1140; GFX10-NEXT: v_mov_b32_e32 v1, 42 1141; GFX10-NEXT: v_mov_b32_e32 v2, 0 1142; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 1143; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1144; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc 1145; GFX10-NEXT: s_waitcnt vmcnt(0) 1146; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] 1147; GFX10-NEXT: s_endpgm 1148 %id = call i32 @llvm.amdgcn.workitem.id.x() 1149 %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id 1150 %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id 1151 %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 1152 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) 1153 store i64 %result, i64 addrspace(1)* %out.gep 1154 ret void 1155} 1156 ; Unused-result i64 global atomic inc addressed by workitem.id.x plus a constant 5 elements. The GFX9/GFX10 checks expect offset:40 on the atomic (still with glc); the CI/VI checks expect VALU address adds, including an add of 40. 1157define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { 1158; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: 1159; CI: ; %bb.0: 1160; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1161; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1162; CI-NEXT: s_waitcnt lgkmcnt(0) 1163; CI-NEXT: v_mov_b32_e32 v0, s0 1164; CI-NEXT: v_mov_b32_e32 v1, s1 1165; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 1166; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1167; CI-NEXT: v_mov_b32_e32 v0, 42 1168; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 1169; CI-NEXT: v_mov_b32_e32 v1, 0 1170; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1171; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1172; CI-NEXT: s_endpgm 1173; 1174; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: 1175; VI: ; %bb.0: 1176; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1177; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1178; VI-NEXT: s_waitcnt lgkmcnt(0) 1179; VI-NEXT: v_mov_b32_e32 v0, s0 1180; VI-NEXT: v_mov_b32_e32 v1, s1 1181; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 1182; VI-NEXT:
v_addc_u32_e32 v3, vcc, 0, v1, vcc 1183; VI-NEXT: v_mov_b32_e32 v0, 42 1184; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 1185; VI-NEXT: v_mov_b32_e32 v1, 0 1186; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1187; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1188; VI-NEXT: s_endpgm 1189; 1190; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: 1191; GFX9: ; %bb.0: 1192; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1193; GFX9-NEXT: v_mov_b32_e32 v1, 42 1194; GFX9-NEXT: v_mov_b32_e32 v2, 0 1195; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1196; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1197; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc 1198; GFX9-NEXT: s_endpgm 1199; 1200; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: 1201; GFX10: ; %bb.0: 1202; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1203; GFX10-NEXT: v_mov_b32_e32 v1, 42 1204; GFX10-NEXT: v_mov_b32_e32 v2, 0 1205; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1206; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1207; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc 1208; GFX10-NEXT: s_endpgm 1209 %id = call i32 @llvm.amdgcn.workitem.id.x() 1210 %gep.tid = getelementptr i32, i64 addrspace(1)* %ptr, i32 %id 1211 %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 1212 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) 1213 ret void 1214} 1215 ; Returning i32 atomic inc of 42 on a flat pointer; the result is stored to %out. All four RUN lines are checked through the shared GCN prefix here. 1216define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32* %out, i32* %ptr) #0 { 1217; GCN-LABEL: flat_atomic_inc_ret_i32: 1218; GCN: ; %bb.0: 1219; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1220; GCN-NEXT: v_mov_b32_e32 v2, 42 1221; GCN-NEXT: s_waitcnt lgkmcnt(0) 1222; GCN-NEXT: v_mov_b32_e32 v0, s2 1223; GCN-NEXT: v_mov_b32_e32 v1, s3 1224; GCN-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 1225; GCN-NEXT: v_mov_b32_e32 v0, s0 1226; GCN-NEXT: v_mov_b32_e32 v1, s1 1227; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1228; GCN-NEXT: flat_store_dword v[0:1], v2 1229; GCN-NEXT:
s_endpgm 1230 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) 1231 store i32 %result, i32* %out 1232 ret void 1233} 1234
; Returning i32 flat atomic inc through a constant GEP of 4 elements. Only the GFX9 checks expect the immediate offset:16 on the atomic; the CI, VI and GFX10 checks expect s_add_u32/s_addc_u32 of 16 on the base address.
1235define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32* %out, i32* %ptr) #0 { 1236; CI-LABEL: flat_atomic_inc_ret_i32_offset: 1237; CI: ; %bb.0: 1238; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1239; CI-NEXT: v_mov_b32_e32 v2, 42 1240; CI-NEXT: s_waitcnt lgkmcnt(0) 1241; CI-NEXT: s_add_u32 s2, s2, 16 1242; CI-NEXT: s_addc_u32 s3, s3, 0 1243; CI-NEXT: v_mov_b32_e32 v0, s2 1244; CI-NEXT: v_mov_b32_e32 v1, s3 1245; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 1246; CI-NEXT: v_mov_b32_e32 v0, s0 1247; CI-NEXT: v_mov_b32_e32 v1, s1 1248; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1249; CI-NEXT: flat_store_dword v[0:1], v2 1250; CI-NEXT: s_endpgm 1251; 1252; VI-LABEL: flat_atomic_inc_ret_i32_offset: 1253; VI: ; %bb.0: 1254; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1255; VI-NEXT: v_mov_b32_e32 v2, 42 1256; VI-NEXT: s_waitcnt lgkmcnt(0) 1257; VI-NEXT: s_add_u32 s2, s2, 16 1258; VI-NEXT: s_addc_u32 s3, s3, 0 1259; VI-NEXT: v_mov_b32_e32 v0, s2 1260; VI-NEXT: v_mov_b32_e32 v1, s3 1261; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 1262; VI-NEXT: v_mov_b32_e32 v0, s0 1263; VI-NEXT: v_mov_b32_e32 v1, s1 1264; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1265; VI-NEXT: flat_store_dword v[0:1], v2 1266; VI-NEXT: s_endpgm 1267; 1268; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: 1269; GFX9: ; %bb.0: 1270; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1271; GFX9-NEXT: v_mov_b32_e32 v2, 42 1272; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1273; GFX9-NEXT: v_mov_b32_e32 v0, s2 1274; GFX9-NEXT: v_mov_b32_e32 v1, s3 1275; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc 1276; GFX9-NEXT: v_mov_b32_e32 v0, s0 1277; GFX9-NEXT: v_mov_b32_e32 v1, s1 1278; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1279; GFX9-NEXT: flat_store_dword v[0:1], v2 1280; GFX9-NEXT: s_endpgm 1281; 1282; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: 1283;
GFX10: ; %bb.0: 1284; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1285; GFX10-NEXT: v_mov_b32_e32 v2, 42 1286; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX10-NEXT: s_add_u32 s2, s2, 16 1288; GFX10-NEXT: s_addc_u32 s3, s3, 0 1289; GFX10-NEXT: v_mov_b32_e32 v0, s2 1290; GFX10-NEXT: v_mov_b32_e32 v1, s3 1291; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc 1292; GFX10-NEXT: v_mov_b32_e32 v0, s0 1293; GFX10-NEXT: v_mov_b32_e32 v1, s1 1294; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1295; GFX10-NEXT: flat_store_dword v[0:1], v2 1296; GFX10-NEXT: s_endpgm 1297 %gep = getelementptr i32, i32* %ptr, i32 4 1298 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) 1299 store i32 %result, i32* %out 1300 ret void 1301} 1302 ; i32 flat atomic inc whose result is unused; checked through the shared GCN prefix, and the expected instruction still carries glc. 1303define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32* %ptr) nounwind { 1304; GCN-LABEL: flat_atomic_inc_noret_i32: 1305; GCN: ; %bb.0: 1306; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1307; GCN-NEXT: v_mov_b32_e32 v2, 42 1308; GCN-NEXT: s_waitcnt lgkmcnt(0) 1309; GCN-NEXT: v_mov_b32_e32 v0, s0 1310; GCN-NEXT: v_mov_b32_e32 v1, s1 1311; GCN-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 1312; GCN-NEXT: s_endpgm 1313 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) 1314 ret void 1315} 1316 ; Unused-result i32 flat atomic inc through a constant GEP of 4 elements. Only the GFX9 checks expect offset:16 on the atomic; the CI, VI and GFX10 checks expect a scalar add of 16 to the base address. 1317define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32* %ptr) nounwind { 1318; CI-LABEL: flat_atomic_inc_noret_i32_offset: 1319; CI: ; %bb.0: 1320; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1321; CI-NEXT: v_mov_b32_e32 v2, 42 1322; CI-NEXT: s_waitcnt lgkmcnt(0) 1323; CI-NEXT: s_add_u32 s0, s0, 16 1324; CI-NEXT: s_addc_u32 s1, s1, 0 1325; CI-NEXT: v_mov_b32_e32 v0, s0 1326; CI-NEXT: v_mov_b32_e32 v1, s1 1327; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 1328; CI-NEXT: s_endpgm 1329; 1330; VI-LABEL: flat_atomic_inc_noret_i32_offset: 1331; VI: ; %bb.0: 1332; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1333; VI-NEXT: v_mov_b32_e32 v2, 42 1334; VI-NEXT: s_waitcnt lgkmcnt(0)
1335; VI-NEXT: s_add_u32 s0, s0, 16 1336; VI-NEXT: s_addc_u32 s1, s1, 0 1337; VI-NEXT: v_mov_b32_e32 v0, s0 1338; VI-NEXT: v_mov_b32_e32 v1, s1 1339; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 1340; VI-NEXT: s_endpgm 1341; 1342; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: 1343; GFX9: ; %bb.0: 1344; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1345; GFX9-NEXT: v_mov_b32_e32 v2, 42 1346; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1347; GFX9-NEXT: v_mov_b32_e32 v0, s0 1348; GFX9-NEXT: v_mov_b32_e32 v1, s1 1349; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc 1350; GFX9-NEXT: s_endpgm 1351; 1352; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: 1353; GFX10: ; %bb.0: 1354; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1355; GFX10-NEXT: v_mov_b32_e32 v2, 42 1356; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1357; GFX10-NEXT: s_add_u32 s0, s0, 16 1358; GFX10-NEXT: s_addc_u32 s1, s1, 0 1359; GFX10-NEXT: v_mov_b32_e32 v0, s0 1360; GFX10-NEXT: v_mov_b32_e32 v1, s1 1361; GFX10-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 1362; GFX10-NEXT: s_endpgm 1363 %gep = getelementptr i32, i32* %ptr, i32 4 1364 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) 1365 ret void 1366} 1367 ; Returning i32 flat atomic inc addressed by workitem.id.x plus a constant 5 elements; the result is stored at the same workitem index of %out. Only the GFX9 checks expect offset:20 on the atomic; the CI, VI and GFX10 checks expect a separate add of 20. 1368define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 { 1369; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: 1370; CI: ; %bb.0: 1371; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1372; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1373; CI-NEXT: s_waitcnt lgkmcnt(0) 1374; CI-NEXT: v_mov_b32_e32 v0, s2 1375; CI-NEXT: v_mov_b32_e32 v1, s3 1376; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 1377; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1378; CI-NEXT: v_mov_b32_e32 v0, s0 1379; CI-NEXT: v_mov_b32_e32 v1, s1 1380; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1381; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1382; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 1383; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 1384; CI-NEXT: v_mov_b32_e32 v4, 42 1385; CI-NEXT:
flat_atomic_inc v2, v[2:3], v4 glc 1386; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1387; CI-NEXT: flat_store_dword v[0:1], v2 1388; CI-NEXT: s_endpgm 1389; 1390; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: 1391; VI: ; %bb.0: 1392; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1393; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1394; VI-NEXT: s_waitcnt lgkmcnt(0) 1395; VI-NEXT: v_mov_b32_e32 v0, s2 1396; VI-NEXT: v_mov_b32_e32 v1, s3 1397; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 1398; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1399; VI-NEXT: v_mov_b32_e32 v0, s0 1400; VI-NEXT: v_mov_b32_e32 v1, s1 1401; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1402; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1403; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 1404; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 1405; VI-NEXT: v_mov_b32_e32 v4, 42 1406; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc 1407; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1408; VI-NEXT: flat_store_dword v[0:1], v2 1409; VI-NEXT: s_endpgm 1410; 1411; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: 1412; GFX9: ; %bb.0: 1413; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1414; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1415; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX9-NEXT: v_mov_b32_e32 v0, s2 1417; GFX9-NEXT: v_mov_b32_e32 v1, s3 1418; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1419; GFX9-NEXT: v_mov_b32_e32 v3, s1 1420; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1421; GFX9-NEXT: v_mov_b32_e32 v2, s0 1422; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1423; GFX9-NEXT: v_mov_b32_e32 v4, 42 1424; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 offset:20 glc 1425; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1426; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1427; GFX9-NEXT: flat_store_dword v[2:3], v0 1428; GFX9-NEXT: s_endpgm 1429; 1430; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: 1431; GFX10: ; %bb.0: 1432; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1433; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1434; GFX10-NEXT: v_mov_b32_e32 
v3, 42 1435; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1436; GFX10-NEXT: v_mov_b32_e32 v0, s2 1437; GFX10-NEXT: v_mov_b32_e32 v1, s3 1438; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 1439; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1440; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 1441; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1442; GFX10-NEXT: flat_atomic_inc v3, v[0:1], v3 glc 1443; GFX10-NEXT: v_mov_b32_e32 v0, s0 1444; GFX10-NEXT: v_mov_b32_e32 v1, s1 1445; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 1446; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1447; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1448; GFX10-NEXT: flat_store_dword v[0:1], v3 1449; GFX10-NEXT: s_endpgm 1450 %id = call i32 @llvm.amdgcn.workitem.id.x() 1451 %gep.tid = getelementptr i32, i32* %ptr, i32 %id 1452 %out.gep = getelementptr i32, i32* %out, i32 %id 1453 %gep = getelementptr i32, i32* %gep.tid, i32 5 1454 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) 1455 store i32 %result, i32* %out.gep 1456 ret void 1457} 1458 ; Unused-result i32 flat atomic inc addressed by workitem.id.x plus a constant 5 elements. Only the GFX9 checks expect offset:20 on the atomic; the CI, VI and GFX10 checks expect a separate add of 20. 1459define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 { 1460; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: 1461; CI: ; %bb.0: 1462; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1463; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1464; CI-NEXT: s_waitcnt lgkmcnt(0) 1465; CI-NEXT: v_mov_b32_e32 v0, s0 1466; CI-NEXT: v_mov_b32_e32 v1, s1 1467; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1468; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1469; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 1470; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1471; CI-NEXT: v_mov_b32_e32 v2, 42 1472; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 1473; CI-NEXT: s_endpgm 1474; 1475; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: 1476; VI: ; %bb.0: 1477; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1478; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1479; VI-NEXT: s_waitcnt lgkmcnt(0) 1480; VI-NEXT: v_mov_b32_e32 v0,
s0 1481; VI-NEXT: v_mov_b32_e32 v1, s1 1482; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1483; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1484; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 1485; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1486; VI-NEXT: v_mov_b32_e32 v2, 42 1487; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 1488; VI-NEXT: s_endpgm 1489; 1490; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: 1491; GFX9: ; %bb.0: 1492; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1493; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1494; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1495; GFX9-NEXT: v_mov_b32_e32 v0, s0 1496; GFX9-NEXT: v_mov_b32_e32 v1, s1 1497; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1498; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1499; GFX9-NEXT: v_mov_b32_e32 v2, 42 1500; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:20 glc 1501; GFX9-NEXT: s_endpgm 1502; 1503; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: 1504; GFX10: ; %bb.0: 1505; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1506; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1507; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1508; GFX10-NEXT: v_mov_b32_e32 v0, s0 1509; GFX10-NEXT: v_mov_b32_e32 v1, s1 1510; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 1511; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1512; GFX10-NEXT: v_mov_b32_e32 v2, 42 1513; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 1514; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1515; GFX10-NEXT: flat_atomic_inc v0, v[0:1], v2 glc 1516; GFX10-NEXT: s_endpgm 1517 %id = call i32 @llvm.amdgcn.workitem.id.x() 1518 %gep.tid = getelementptr i32, i32* %ptr, i32 %id 1519 %gep = getelementptr i32, i32* %gep.tid, i32 5 1520 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) 1521 ret void 1522} 1523 ; 512 x i64 array in addrspace(3), indexed by @atomic_inc_shl_base_lds_0_i64. 1524@lds1 = internal addrspace(3) global [512 x i64] undef, align 8 1525 ; Returning i64 atomic inc of 9 on @lds1 at index workitem.id.x + 2; the index is also stored to %add_use and the atomic result to %out. The checks expect the workitem id shifted left by 3 as the address with offset:16 on ds_inc_rtn_u64. 1526define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { 1527;
CI-LABEL: atomic_inc_shl_base_lds_0_i64: 1528; CI: ; %bb.0: 1529; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1530; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 1531; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1532; CI-NEXT: v_mov_b32_e32 v0, 9 1533; CI-NEXT: v_mov_b32_e32 v1, 0 1534; CI-NEXT: s_mov_b32 m0, -1 1535; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16 1536; CI-NEXT: s_waitcnt lgkmcnt(0) 1537; CI-NEXT: v_mov_b32_e32 v2, s2 1538; CI-NEXT: v_mov_b32_e32 v3, s3 1539; CI-NEXT: flat_store_dword v[2:3], v4 1540; CI-NEXT: v_mov_b32_e32 v3, s1 1541; CI-NEXT: v_mov_b32_e32 v2, s0 1542; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1543; CI-NEXT: s_endpgm 1544; 1545; VI-LABEL: atomic_inc_shl_base_lds_0_i64: 1546; VI: ; %bb.0: 1547; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1548; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 1549; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1550; VI-NEXT: v_mov_b32_e32 v0, 9 1551; VI-NEXT: v_mov_b32_e32 v1, 0 1552; VI-NEXT: s_mov_b32 m0, -1 1553; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16 1554; VI-NEXT: s_waitcnt lgkmcnt(0) 1555; VI-NEXT: v_mov_b32_e32 v2, s2 1556; VI-NEXT: v_mov_b32_e32 v3, s3 1557; VI-NEXT: flat_store_dword v[2:3], v4 1558; VI-NEXT: v_mov_b32_e32 v3, s1 1559; VI-NEXT: v_mov_b32_e32 v2, s0 1560; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1561; VI-NEXT: s_endpgm 1562; 1563; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: 1564; GFX9: ; %bb.0: 1565; GFX9-NEXT: v_mov_b32_e32 v1, 9 1566; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1567; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 1568; GFX9-NEXT: v_mov_b32_e32 v2, 0 1569; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1570; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v0, v[1:2] offset:16 1571; GFX9-NEXT: v_mov_b32_e32 v2, 0 1572; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1573; GFX9-NEXT: global_store_dword v2, v3, s[2:3] 1574; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1575; GFX9-NEXT: s_endpgm 1576; 1577; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64: 1578; GFX10: ; %bb.0: 1579; GFX10-NEXT: v_mov_b32_e32 v1, 9 1580; 
GFX10-NEXT: v_mov_b32_e32 v2, 0 1581; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 1582; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1583; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 1584; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 1585; GFX10-NEXT: v_mov_b32_e32 v3, 0 1586; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1587; GFX10-NEXT: global_store_dword v3, v0, s[2:3] 1588; GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] 1589; GFX10-NEXT: s_endpgm 1590 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 1591 %idx.0 = add nsw i32 %tid.x, 2 1592 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 1593 %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false) 1594 store i32 %idx.0, i32 addrspace(1)* %add_use 1595 store i64 %val0, i64 addrspace(1)* %out 1596 ret void 1597} 1598 ; Returning i64 atomic inc of 42 on a flat pointer; the result is stored to %out. All four RUN lines are checked through the shared GCN prefix here. 1599define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 { 1600; GCN-LABEL: flat_atomic_inc_ret_i64: 1601; GCN: ; %bb.0: 1602; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1603; GCN-NEXT: v_mov_b32_e32 v0, 42 1604; GCN-NEXT: v_mov_b32_e32 v1, 0 1605; GCN-NEXT: s_waitcnt lgkmcnt(0) 1606; GCN-NEXT: v_mov_b32_e32 v2, s2 1607; GCN-NEXT: v_mov_b32_e32 v3, s3 1608; GCN-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1609; GCN-NEXT: v_mov_b32_e32 v3, s1 1610; GCN-NEXT: v_mov_b32_e32 v2, s0 1611; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1612; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1613; GCN-NEXT: s_endpgm 1614 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) 1615 store i64 %result, i64* %out 1616 ret void 1617} 1618 ; Returning i64 flat atomic inc through a constant GEP of 4 elements. Only the GFX9 checks expect the immediate offset:32 on the atomic; the CI, VI and GFX10 checks expect s_add_u32/s_addc_u32 of 32 on the base address. 1619define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) #0 { 1620; CI-LABEL: flat_atomic_inc_ret_i64_offset: 1621; CI: ; %bb.0: 1622; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1623; CI-NEXT: v_mov_b32_e32 v0, 42 1624; CI-NEXT: v_mov_b32_e32 v1, 0 1625; CI-NEXT: s_waitcnt lgkmcnt(0)
1626; CI-NEXT: s_add_u32 s2, s2, 32 1627; CI-NEXT: s_addc_u32 s3, s3, 0 1628; CI-NEXT: v_mov_b32_e32 v2, s2 1629; CI-NEXT: v_mov_b32_e32 v3, s3 1630; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1631; CI-NEXT: v_mov_b32_e32 v3, s1 1632; CI-NEXT: v_mov_b32_e32 v2, s0 1633; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1634; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1635; CI-NEXT: s_endpgm 1636; 1637; VI-LABEL: flat_atomic_inc_ret_i64_offset: 1638; VI: ; %bb.0: 1639; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1640; VI-NEXT: v_mov_b32_e32 v0, 42 1641; VI-NEXT: v_mov_b32_e32 v1, 0 1642; VI-NEXT: s_waitcnt lgkmcnt(0) 1643; VI-NEXT: s_add_u32 s2, s2, 32 1644; VI-NEXT: s_addc_u32 s3, s3, 0 1645; VI-NEXT: v_mov_b32_e32 v2, s2 1646; VI-NEXT: v_mov_b32_e32 v3, s3 1647; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1648; VI-NEXT: v_mov_b32_e32 v3, s1 1649; VI-NEXT: v_mov_b32_e32 v2, s0 1650; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1651; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1652; VI-NEXT: s_endpgm 1653; 1654; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: 1655; GFX9: ; %bb.0: 1656; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1657; GFX9-NEXT: v_mov_b32_e32 v0, 42 1658; GFX9-NEXT: v_mov_b32_e32 v1, 0 1659; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1660; GFX9-NEXT: v_mov_b32_e32 v2, s2 1661; GFX9-NEXT: v_mov_b32_e32 v3, s3 1662; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc 1663; GFX9-NEXT: v_mov_b32_e32 v3, s1 1664; GFX9-NEXT: v_mov_b32_e32 v2, s0 1665; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1666; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1667; GFX9-NEXT: s_endpgm 1668; 1669; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: 1670; GFX10: ; %bb.0: 1671; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1672; GFX10-NEXT: v_mov_b32_e32 v0, 42 1673; GFX10-NEXT: v_mov_b32_e32 v1, 0 1674; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1675; GFX10-NEXT: s_add_u32 s2, s2, 32 1676; GFX10-NEXT: s_addc_u32 s3, s3, 0 1677; GFX10-NEXT: v_mov_b32_e32 v2, s2 1678; GFX10-NEXT: 
v_mov_b32_e32 v3, s3 1679; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1680; GFX10-NEXT: v_mov_b32_e32 v3, s1 1681; GFX10-NEXT: v_mov_b32_e32 v2, s0 1682; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1683; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1684; GFX10-NEXT: s_endpgm 1685 %gep = getelementptr i64, i64* %ptr, i32 4 1686 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) 1687 store i64 %result, i64* %out 1688 ret void 1689} 1690 ; i64 flat atomic inc whose result is unused; checked through the shared GCN prefix, and the expected instruction still carries glc. 1691define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind { 1692; GCN-LABEL: flat_atomic_inc_noret_i64: 1693; GCN: ; %bb.0: 1694; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1695; GCN-NEXT: v_mov_b32_e32 v0, 42 1696; GCN-NEXT: v_mov_b32_e32 v1, 0 1697; GCN-NEXT: s_waitcnt lgkmcnt(0) 1698; GCN-NEXT: v_mov_b32_e32 v3, s1 1699; GCN-NEXT: v_mov_b32_e32 v2, s0 1700; GCN-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1701; GCN-NEXT: s_endpgm 1702 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) 1703 ret void 1704} 1705 ; Unused-result i64 flat atomic inc through a constant GEP of 4 elements. Only the GFX9 checks expect offset:32 on the atomic; the CI, VI and GFX10 checks expect a scalar add of 32 to the base address. 1706define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind { 1707; CI-LABEL: flat_atomic_inc_noret_i64_offset: 1708; CI: ; %bb.0: 1709; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1710; CI-NEXT: v_mov_b32_e32 v0, 42 1711; CI-NEXT: v_mov_b32_e32 v1, 0 1712; CI-NEXT: s_waitcnt lgkmcnt(0) 1713; CI-NEXT: s_add_u32 s0, s0, 32 1714; CI-NEXT: s_addc_u32 s1, s1, 0 1715; CI-NEXT: v_mov_b32_e32 v3, s1 1716; CI-NEXT: v_mov_b32_e32 v2, s0 1717; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1718; CI-NEXT: s_endpgm 1719; 1720; VI-LABEL: flat_atomic_inc_noret_i64_offset: 1721; VI: ; %bb.0: 1722; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1723; VI-NEXT: v_mov_b32_e32 v0, 42 1724; VI-NEXT: v_mov_b32_e32 v1, 0 1725; VI-NEXT: s_waitcnt lgkmcnt(0) 1726; VI-NEXT: s_add_u32 s0, s0, 32 1727; VI-NEXT: s_addc_u32 s1, s1, 0 1728; VI-NEXT: v_mov_b32_e32 v3, s1 1729; VI-NEXT: v_mov_b32_e32
v2, s0 1730; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1731; VI-NEXT: s_endpgm 1732; 1733; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: 1734; GFX9: ; %bb.0: 1735; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1736; GFX9-NEXT: v_mov_b32_e32 v0, 42 1737; GFX9-NEXT: v_mov_b32_e32 v1, 0 1738; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1739; GFX9-NEXT: v_mov_b32_e32 v3, s1 1740; GFX9-NEXT: v_mov_b32_e32 v2, s0 1741; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc 1742; GFX9-NEXT: s_endpgm 1743; 1744; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: 1745; GFX10: ; %bb.0: 1746; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1747; GFX10-NEXT: v_mov_b32_e32 v0, 42 1748; GFX10-NEXT: v_mov_b32_e32 v1, 0 1749; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1750; GFX10-NEXT: s_add_u32 s0, s0, 32 1751; GFX10-NEXT: s_addc_u32 s1, s1, 0 1752; GFX10-NEXT: v_mov_b32_e32 v3, s1 1753; GFX10-NEXT: v_mov_b32_e32 v2, s0 1754; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1755; GFX10-NEXT: s_endpgm 1756 %gep = getelementptr i64, i64* %ptr, i32 4 1757 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) 1758 ret void 1759} 1760 1761define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 { 1762; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: 1763; CI: ; %bb.0: 1764; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1765; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1766; CI-NEXT: s_waitcnt lgkmcnt(0) 1767; CI-NEXT: v_mov_b32_e32 v0, s2 1768; CI-NEXT: v_mov_b32_e32 v1, s3 1769; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 1770; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1771; CI-NEXT: v_mov_b32_e32 v0, s0 1772; CI-NEXT: v_mov_b32_e32 v1, s1 1773; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1774; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1775; CI-NEXT: v_mov_b32_e32 v2, 42 1776; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 1777; CI-NEXT: v_mov_b32_e32 v3, 0 1778; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1779; CI-NEXT: 
flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc 1780; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1781; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1782; CI-NEXT: s_endpgm 1783; 1784; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: 1785; VI: ; %bb.0: 1786; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1787; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1788; VI-NEXT: s_waitcnt lgkmcnt(0) 1789; VI-NEXT: v_mov_b32_e32 v0, s2 1790; VI-NEXT: v_mov_b32_e32 v1, s3 1791; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 1792; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1793; VI-NEXT: v_mov_b32_e32 v0, s0 1794; VI-NEXT: v_mov_b32_e32 v1, s1 1795; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1796; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1797; VI-NEXT: v_mov_b32_e32 v2, 42 1798; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 1799; VI-NEXT: v_mov_b32_e32 v3, 0 1800; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1801; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc 1802; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1803; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1804; VI-NEXT: s_endpgm 1805; 1806; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: 1807; GFX9: ; %bb.0: 1808; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1809; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1810; GFX9-NEXT: v_mov_b32_e32 v4, 42 1811; GFX9-NEXT: v_mov_b32_e32 v5, 0 1812; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1813; GFX9-NEXT: v_mov_b32_e32 v0, s2 1814; GFX9-NEXT: v_mov_b32_e32 v1, s3 1815; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 1816; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1817; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] offset:40 glc 1818; GFX9-NEXT: v_mov_b32_e32 v3, s1 1819; GFX9-NEXT: v_mov_b32_e32 v2, s0 1820; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 1821; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1822; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1823; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1824; GFX9-NEXT: s_endpgm 1825; 1826; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: 1827; GFX10: ; %bb.0: 1828; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1829; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1830; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1831; GFX10-NEXT: v_mov_b32_e32 v0, s2 1832; GFX10-NEXT: v_mov_b32_e32 v1, s3 1833; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 1834; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo 1835; GFX10-NEXT: v_mov_b32_e32 v0, 42 1836; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 1837; GFX10-NEXT: v_mov_b32_e32 v1, 0 1838; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1839; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1840; GFX10-NEXT: v_mov_b32_e32 v3, s1 1841; GFX10-NEXT: v_mov_b32_e32 v2, s0 1842; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1843; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1844; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1845; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1846; GFX10-NEXT: s_endpgm 1847 %id = call i32 @llvm.amdgcn.workitem.id.x() 1848 %gep.tid = getelementptr i64, i64* %ptr, i32 %id 1849 %out.gep = getelementptr i64, i64* %out, i32 %id 1850 %gep = getelementptr i64, i64* %gep.tid, i32 5 1851 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) 1852 store i64 %result, i64* %out.gep 1853 ret void 1854} 1855 1856define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 { 1857; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: 1858; CI: ; %bb.0: 1859; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1860; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1861; CI-NEXT: s_waitcnt lgkmcnt(0) 1862; CI-NEXT: v_mov_b32_e32 v0, s0 1863; CI-NEXT: v_mov_b32_e32 v1, s1 1864; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 1865; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1866; CI-NEXT: v_mov_b32_e32 v0, 42 1867; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 1868; CI-NEXT: v_mov_b32_e32 v1, 0 1869; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1870; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1871; CI-NEXT: s_endpgm 1872; 1873; 
VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: 1874; VI: ; %bb.0: 1875; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1876; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1877; VI-NEXT: s_waitcnt lgkmcnt(0) 1878; VI-NEXT: v_mov_b32_e32 v0, s0 1879; VI-NEXT: v_mov_b32_e32 v1, s1 1880; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 1881; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1882; VI-NEXT: v_mov_b32_e32 v0, 42 1883; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 1884; VI-NEXT: v_mov_b32_e32 v1, 0 1885; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1886; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1887; VI-NEXT: s_endpgm 1888; 1889; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: 1890; GFX9: ; %bb.0: 1891; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1892; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1893; GFX9-NEXT: v_mov_b32_e32 v1, 42 1894; GFX9-NEXT: v_mov_b32_e32 v2, 0 1895; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1896; GFX9-NEXT: v_mov_b32_e32 v4, s1 1897; GFX9-NEXT: v_mov_b32_e32 v3, s0 1898; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 1899; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 1900; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc 1901; GFX9-NEXT: s_endpgm 1902; 1903; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: 1904; GFX10: ; %bb.0: 1905; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1906; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1907; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1908; GFX10-NEXT: v_mov_b32_e32 v0, s0 1909; GFX10-NEXT: v_mov_b32_e32 v1, s1 1910; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 1911; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo 1912; GFX10-NEXT: v_mov_b32_e32 v0, 42 1913; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 1914; GFX10-NEXT: v_mov_b32_e32 v1, 0 1915; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1916; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 1917; GFX10-NEXT: s_endpgm 1918 %id = call i32 @llvm.amdgcn.workitem.id.x() 1919 %gep.tid = getelementptr i64, i64* %ptr, i32 %id 
1920 %gep = getelementptr i64, i64* %gep.tid, i32 5 1921 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) 1922 ret void 1923} 1924 1925define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 { 1926; CI-LABEL: nocse_lds_atomic_inc_ret_i32: 1927; CI: ; %bb.0: 1928; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1929; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1930; CI-NEXT: v_mov_b32_e32 v0, 42 1931; CI-NEXT: s_mov_b32 m0, -1 1932; CI-NEXT: s_waitcnt lgkmcnt(0) 1933; CI-NEXT: v_mov_b32_e32 v2, s2 1934; CI-NEXT: v_mov_b32_e32 v1, s4 1935; CI-NEXT: ds_inc_rtn_u32 v4, v1, v0 1936; CI-NEXT: ds_inc_rtn_u32 v5, v1, v0 1937; CI-NEXT: v_mov_b32_e32 v0, s0 1938; CI-NEXT: v_mov_b32_e32 v1, s1 1939; CI-NEXT: v_mov_b32_e32 v3, s3 1940; CI-NEXT: s_waitcnt lgkmcnt(1) 1941; CI-NEXT: flat_store_dword v[0:1], v4 1942; CI-NEXT: s_waitcnt lgkmcnt(0) 1943; CI-NEXT: flat_store_dword v[2:3], v5 1944; CI-NEXT: s_endpgm 1945; 1946; VI-LABEL: nocse_lds_atomic_inc_ret_i32: 1947; VI: ; %bb.0: 1948; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1949; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1950; VI-NEXT: v_mov_b32_e32 v0, 42 1951; VI-NEXT: s_mov_b32 m0, -1 1952; VI-NEXT: s_waitcnt lgkmcnt(0) 1953; VI-NEXT: v_mov_b32_e32 v2, s2 1954; VI-NEXT: v_mov_b32_e32 v1, s4 1955; VI-NEXT: ds_inc_rtn_u32 v4, v1, v0 1956; VI-NEXT: ds_inc_rtn_u32 v5, v1, v0 1957; VI-NEXT: v_mov_b32_e32 v0, s0 1958; VI-NEXT: v_mov_b32_e32 v1, s1 1959; VI-NEXT: v_mov_b32_e32 v3, s3 1960; VI-NEXT: s_waitcnt lgkmcnt(1) 1961; VI-NEXT: flat_store_dword v[0:1], v4 1962; VI-NEXT: s_waitcnt lgkmcnt(0) 1963; VI-NEXT: flat_store_dword v[2:3], v5 1964; VI-NEXT: s_endpgm 1965; 1966; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: 1967; GFX9: ; %bb.0: 1968; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1969; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1970; GFX9-NEXT: v_mov_b32_e32 v0, 42 1971; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1972; 
GFX9-NEXT: v_mov_b32_e32 v1, s6 1973; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 1974; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 1975; GFX9-NEXT: v_mov_b32_e32 v1, 0 1976; GFX9-NEXT: s_waitcnt lgkmcnt(1) 1977; GFX9-NEXT: global_store_dword v1, v2, s[0:1] 1978; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1979; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 1980; GFX9-NEXT: s_endpgm 1981; 1982; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32: 1983; GFX10: ; %bb.0: 1984; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 1985; GFX10-NEXT: v_mov_b32_e32 v0, 42 1986; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1987; GFX10-NEXT: v_mov_b32_e32 v1, s0 1988; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1989; GFX10-NEXT: ds_inc_rtn_u32 v2, v1, v0 1990; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 1991; GFX10-NEXT: v_mov_b32_e32 v1, 0 1992; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1993; GFX10-NEXT: global_store_dword v1, v2, s[0:1] 1994; GFX10-NEXT: global_store_dword v1, v0, s[2:3] 1995; GFX10-NEXT: s_endpgm 1996 %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) 1997 %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) 1998 1999 store i32 %result0, i32 addrspace(1)* %out0 2000 store i32 %result1, i32 addrspace(1)* %out1 2001 ret void 2002} 2003 2004attributes #0 = { nounwind } 2005attributes #1 = { nounwind readnone } 2006attributes #2 = { nounwind argmemonly } 2007