; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s

; Overview: each kernel below performs one "atomicrmw fadd" on a float and the
; expected assembly is recorded for four processors (gfx900, gfx908, gfx90a
; and gfx1010). The kernels differ in three ways: whether the result of the
; atomic is used, which syncscope the atomic carries, and which attribute
; group (defined at the bottom of the file) is attached to the function.
; The expected output is either a native global_atomic_add_f32 instruction or
; a global_atomic_cmpswap retry loop (blocks atomicrmw.start / atomicrmw.end).
;
; The assertion lines are generated by the script named on the first line;
; regenerate them with that script instead of editing them by hand.

; Result is used, no syncscope on the atomic, attribute group #0.
; Every processor is expected to emit the cmpswap loop; the gfx90a output
; additionally brackets the cmpswap with buffer_wbl2 / buffer_invl2.
define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s4
; GFX900-NEXT: BB0_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz BB0_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: s_mov_b64 s[2:3], 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: BB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT: s_cbranch_execnz BB0_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
; GFX90A-NEXT: BB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX90A-NEXT: s_cbranch_execnz BB0_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: BB0_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz BB0_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; Result is used, syncscope("agent"), attribute group #2 (no denormal-mode
; attribute). Only the gfx90a output is expected to use the returning
; global_atomic_add_f32 (glc); the other processors emit the cmpswap loop.
define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) #2 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s4
; GFX900-NEXT: BB1_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz BB1_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: s_mov_b64 s[2:3], 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: BB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT: s_cbranch_execnz BB1_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: BB1_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz BB1_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; Result is unused, syncscope("agent"), attribute group #0.
; The gfx908 and gfx90a outputs are expected to use the no-return
; global_atomic_add_f32; gfx900 and gfx1010 emit the cmpswap loop.
define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_noret_f32:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, s4
; GFX900-NEXT: BB2_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz BB2_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_noret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: BB2_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz BB2_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  ret void
}

; Same IR as @global_atomic_fadd_noret_f32 but with attribute group #2
; (no denormal-mode attribute). The expected output per processor is the
; same shape: native add on gfx908 / gfx90a, cmpswap loop elsewhere.
define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) #2 {
; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, s4
; GFX900-NEXT: BB3_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz BB3_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: BB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz BB3_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  ret void
}

; Result is used, syncscope("agent"), attribute group #0.
; Only the gfx90a output is expected to use the returning
; global_atomic_add_f32 (glc); the other processors emit the cmpswap loop.
define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)* %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s4
; GFX900-NEXT: BB4_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz BB4_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: s_mov_b64 s[2:3], 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: BB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT: s_cbranch_execnz BB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: BB4_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz BB4_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; Result is used, syncscope("one-as"), attribute group #0.
; Every processor is expected to emit the cmpswap loop; the gfx90a output
; additionally brackets the cmpswap with buffer_wbl2 / buffer_invl2. The
; wait before the cmpswap is vmcnt-only here (no lgkmcnt).
define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(float addrspace(1)* %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_system:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s4
; GFX900-NEXT: BB5_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz BB5_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: s_mov_b64 s[2:3], 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: BB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT: s_cbranch_execnz BB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
; GFX90A-NEXT: BB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX90A-NEXT: s_cbranch_execnz BB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_system:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: BB5_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz BB5_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; Result is used, syncscope("agent"), attribute group #1, which overrides the
; function's target-cpu to gfx803 and enables +atomic-fadd-insts. A single
; set of assertions shared by all four invocations expects the cmpswap loop.
define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b64 s[2:3], 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: BB6_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_add_f32_e32 v0, 4.0, v1
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN-NEXT: s_cbranch_execnz BB6_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  store float %result, float addrspace(1)* undef
  ret void
}

; Result is unused, syncscope("agent"), attribute group #1 (target-cpu
; gfx803 with +atomic-fadd-insts). A single set of assertions shared by all
; four invocations expects the no-return global_atomic_add_f32.
define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 4.0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  ret void
}

; Result is unused, syncscope("agent"), and the function carries no attribute
; group at all (so "amdgpu-unsafe-fp-atomics" is not set). Every processor is
; expected to emit the cmpswap loop, including those that use the native add
; for the otherwise identical @global_atomic_fadd_noret_f32.
define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)* %ptr) {
; GFX900-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, s4
; GFX900-NEXT: BB8_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz BB8_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: s_mov_b64 s[2:3], 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s4
; GFX908-NEXT: BB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT: s_cbranch_execnz BB8_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-NEXT: BB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX90A-NEXT: s_cbranch_execnz BB8_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: BB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz BB8_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
  ret void
}

; Loads a generic float* through an addrspace(4) pointer argument and then
; performs a monotonic fadd with syncscope("agent-one-as") on the loaded
; pointer; the result is unused. Although the IR pointer carries no address
; space, the expected output uses global_* instructions: the native
; global_atomic_add_f32 on gfx908 / gfx90a and a global_atomic_cmpswap loop
; on gfx900 / gfx1010. No cache invalidate follows the atomic here.
define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0 {
; GFX900-LABEL: infer_as_before_atomic:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, s4
; GFX900-NEXT: BB9_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz BB9_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: infer_as_before_atomic:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: infer_as_before_atomic:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: infer_as_before_atomic:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: BB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v1
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz BB9_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_endpgm
  %load = load float*, float* addrspace(4)* %arg
  %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
  ret void
}

; Attribute groups used above.
;   #0 - f32 denormal mode "preserve-sign" plus unsafe FP atomics enabled.
;   #1 - as #0, but also overrides the function's target-cpu to gfx803 and
;        adds the +atomic-fadd-insts target feature.
;   #2 - unsafe FP atomics enabled only; no denormal-mode attribute.
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" "amdgpu-unsafe-fp-atomics"="true" }
attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" }