; Codegen test for atomic memory operations issued by the flat_system_*
; kernels in this file. None of the atomic instructions visible here carries
; a syncscope(...) argument.
;
; Each RUN line compiles this file with llc for one configuration and matches
; the output against its own FileCheck prefix:
;   GFX7              - amdgcn-amd-amdhsa, gfx700
;   GFX10-WGP         - amdgcn-amd-amdhsa, gfx1010
;   GFX10-CU          - amdgcn-amd-amdhsa, gfx1010 with -mattr=+cumode
;   SKIP-CACHE-INV    - amdgcn-amd-amdpal, gfx700 with
;                       -amdgcn-skip-cache-invalidations
;   GFX90A-NOTTGSPLIT - amdgcn-amd-amdhsa, gfx90a
;   GFX90A-TGSPLIT    - amdgcn-amd-amdhsa, gfx90a with -mattr=+tgsplit
;
; Kernels are grouped by operation and memory ordering (this list covers the
; first kernels of the file and may not be exhaustive):
;   load atomic             : unordered, monotonic, acquire, seq_cst
;   store atomic            : unordered, monotonic, release, seq_cst
;   atomicrmw volatile xchg : monotonic, acquire, ...
; The load kernels write the loaded value to %out with a plain store; the
; atomicrmw kernels leave their result unused.
;
; The CHECK lines are generated by the script named in the NOTE line below;
; regenerate them with that script rather than editing them by hand.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s 6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s 7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s 8 9define amdgpu_kernel void @flat_system_unordered_load( 10; GFX7-LABEL: flat_system_unordered_load: 11; GFX7: ; %bb.0: ; %entry 12; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 13; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14; GFX7-NEXT: v_mov_b32_e32 v0, s0 15; GFX7-NEXT: v_mov_b32_e32 v1, s1 16; GFX7-NEXT: flat_load_dword v0, v[0:1] 17; GFX7-NEXT: v_mov_b32_e32 v2, s2 18; GFX7-NEXT: v_mov_b32_e32 v3, s3 19; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 20; GFX7-NEXT: flat_store_dword v[2:3], v0 21; GFX7-NEXT: s_endpgm 22; 23; GFX10-WGP-LABEL: flat_system_unordered_load: 24; GFX10-WGP: ; %bb.0: ; %entry 25; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 26; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 27; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 28; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 29; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 30; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 31; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 32; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 33; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 34; GFX10-WGP-NEXT: s_endpgm 35; 36; GFX10-CU-LABEL: flat_system_unordered_load: 37; GFX10-CU: ; %bb.0: ; %entry 
38; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 39; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 40; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 41; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 42; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 43; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 44; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 45; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 46; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 47; GFX10-CU-NEXT: s_endpgm 48; 49; SKIP-CACHE-INV-LABEL: flat_system_unordered_load: 50; SKIP-CACHE-INV: ; %bb.0: ; %entry 51; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 52; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 53; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 54; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 55; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 56; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 57; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 58; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 59; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 60; SKIP-CACHE-INV-NEXT: s_endpgm 61; 62; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load: 63; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 64; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 65; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 66; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 67; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 68; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 69; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 70; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 71; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 72; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 73; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 74; 75; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load: 76; GFX90A-TGSPLIT: ; %bb.0: ; %entry 77; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 78; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 79; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 80; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 81; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 82; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s2 83; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 84; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 85; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 86; GFX90A-TGSPLIT-NEXT: s_endpgm 87 i32* %in, i32* %out) { 88entry: 89 %val = load atomic i32, i32* %in unordered, align 4 90 store i32 %val, i32* %out 91 ret void 92} 93 94define amdgpu_kernel void @flat_system_monotonic_load( 95; GFX7-LABEL: flat_system_monotonic_load: 96; GFX7: ; %bb.0: ; %entry 97; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 98; GFX7-NEXT: s_waitcnt lgkmcnt(0) 99; GFX7-NEXT: v_mov_b32_e32 v0, s0 100; GFX7-NEXT: v_mov_b32_e32 v1, s1 101; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 102; GFX7-NEXT: v_mov_b32_e32 v2, s2 103; GFX7-NEXT: v_mov_b32_e32 v3, s3 104; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 105; GFX7-NEXT: flat_store_dword v[2:3], v0 106; GFX7-NEXT: s_endpgm 107; 108; GFX10-WGP-LABEL: flat_system_monotonic_load: 109; GFX10-WGP: ; %bb.0: ; %entry 110; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 111; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 112; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 113; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 114; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 115; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 116; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 117; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 118; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 119; GFX10-WGP-NEXT: s_endpgm 120; 121; GFX10-CU-LABEL: flat_system_monotonic_load: 122; GFX10-CU: ; %bb.0: ; %entry 123; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 124; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 125; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 126; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 127; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 128; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 129; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 130; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 131; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 132; GFX10-CU-NEXT: s_endpgm 133; 134; SKIP-CACHE-INV-LABEL: flat_system_monotonic_load: 135; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 136; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 137; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 138; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 139; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 140; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 141; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 142; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 143; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 144; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 145; SKIP-CACHE-INV-NEXT: s_endpgm 146; 147; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load: 148; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 149; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 150; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 151; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 152; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 153; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 154; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 155; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 156; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 157; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 158; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 159; 160; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load: 161; GFX90A-TGSPLIT: ; %bb.0: ; %entry 162; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 163; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 164; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 165; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 166; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 167; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 168; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 169; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 170; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 171; GFX90A-TGSPLIT-NEXT: s_endpgm 172 i32* %in, i32* %out) { 173entry: 174 %val = load atomic i32, i32* %in monotonic, align 4 175 store i32 %val, i32* %out 176 ret void 177} 178 179define amdgpu_kernel void @flat_system_acquire_load( 180; GFX7-LABEL: flat_system_acquire_load: 181; 
GFX7: ; %bb.0: ; %entry 182; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 183; GFX7-NEXT: s_waitcnt lgkmcnt(0) 184; GFX7-NEXT: v_mov_b32_e32 v0, s0 185; GFX7-NEXT: v_mov_b32_e32 v1, s1 186; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 187; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 188; GFX7-NEXT: buffer_wbinvl1_vol 189; GFX7-NEXT: v_mov_b32_e32 v2, s2 190; GFX7-NEXT: v_mov_b32_e32 v3, s3 191; GFX7-NEXT: flat_store_dword v[2:3], v0 192; GFX7-NEXT: s_endpgm 193; 194; GFX10-WGP-LABEL: flat_system_acquire_load: 195; GFX10-WGP: ; %bb.0: ; %entry 196; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 197; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 198; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 199; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 200; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 201; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 202; GFX10-WGP-NEXT: buffer_gl0_inv 203; GFX10-WGP-NEXT: buffer_gl1_inv 204; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 205; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 206; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 207; GFX10-WGP-NEXT: s_endpgm 208; 209; GFX10-CU-LABEL: flat_system_acquire_load: 210; GFX10-CU: ; %bb.0: ; %entry 211; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 212; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 213; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 214; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 215; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 216; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 217; GFX10-CU-NEXT: buffer_gl0_inv 218; GFX10-CU-NEXT: buffer_gl1_inv 219; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 220; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 221; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 222; GFX10-CU-NEXT: s_endpgm 223; 224; SKIP-CACHE-INV-LABEL: flat_system_acquire_load: 225; SKIP-CACHE-INV: ; %bb.0: ; %entry 226; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 227; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 228; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 229; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 230; SKIP-CACHE-INV-NEXT: 
flat_load_dword v0, v[0:1] glc 231; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 232; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 233; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 234; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 235; SKIP-CACHE-INV-NEXT: s_endpgm 236; 237; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load: 238; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 239; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 240; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 241; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 242; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 243; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 244; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 245; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 246; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 247; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 248; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 249; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 250; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 251; 252; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load: 253; GFX90A-TGSPLIT: ; %bb.0: ; %entry 254; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 255; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 256; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 257; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 258; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 259; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 260; GFX90A-TGSPLIT-NEXT: buffer_invl2 261; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 262; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 263; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 264; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 265; GFX90A-TGSPLIT-NEXT: s_endpgm 266 i32* %in, i32* %out) { 267entry: 268 %val = load atomic i32, i32* %in acquire, align 4 269 store i32 %val, i32* %out 270 ret void 271} 272 273define amdgpu_kernel void @flat_system_seq_cst_load( 274; GFX7-LABEL: flat_system_seq_cst_load: 275; GFX7: ; %bb.0: ; %entry 276; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 277; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) 278; GFX7-NEXT: v_mov_b32_e32 v0, s0 279; GFX7-NEXT: v_mov_b32_e32 v1, s1 280; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 281; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 282; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 283; GFX7-NEXT: buffer_wbinvl1_vol 284; GFX7-NEXT: v_mov_b32_e32 v2, s2 285; GFX7-NEXT: v_mov_b32_e32 v3, s3 286; GFX7-NEXT: flat_store_dword v[2:3], v0 287; GFX7-NEXT: s_endpgm 288; 289; GFX10-WGP-LABEL: flat_system_seq_cst_load: 290; GFX10-WGP: ; %bb.0: ; %entry 291; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 292; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 293; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 294; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 295; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 296; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 297; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 298; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 299; GFX10-WGP-NEXT: buffer_gl0_inv 300; GFX10-WGP-NEXT: buffer_gl1_inv 301; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 302; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 303; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 304; GFX10-WGP-NEXT: s_endpgm 305; 306; GFX10-CU-LABEL: flat_system_seq_cst_load: 307; GFX10-CU: ; %bb.0: ; %entry 308; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 309; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 310; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 311; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 312; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 313; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 314; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 315; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 316; GFX10-CU-NEXT: buffer_gl0_inv 317; GFX10-CU-NEXT: buffer_gl1_inv 318; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 319; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 320; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 321; GFX10-CU-NEXT: s_endpgm 322; 323; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_load: 324; SKIP-CACHE-INV: ; %bb.0: ; %entry 325; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 326; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) 327; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 328; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 329; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 330; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 331; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 332; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 333; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 334; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 335; SKIP-CACHE-INV-NEXT: s_endpgm 336; 337; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: 338; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 339; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 340; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 341; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 342; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 343; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 344; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 345; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 346; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 347; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 348; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 349; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 350; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 351; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 352; 353; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load: 354; GFX90A-TGSPLIT: ; %bb.0: ; %entry 355; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 356; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 357; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 358; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 359; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 360; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 361; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 362; GFX90A-TGSPLIT-NEXT: buffer_invl2 363; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 364; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 365; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 366; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 367; GFX90A-TGSPLIT-NEXT: s_endpgm 368 i32* %in, i32* %out) { 
369entry: 370 %val = load atomic i32, i32* %in seq_cst, align 4 371 store i32 %val, i32* %out 372 ret void 373} 374 375define amdgpu_kernel void @flat_system_unordered_store( 376; GFX7-LABEL: flat_system_unordered_store: 377; GFX7: ; %bb.0: ; %entry 378; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 379; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 380; GFX7-NEXT: s_waitcnt lgkmcnt(0) 381; GFX7-NEXT: v_mov_b32_e32 v2, s2 382; GFX7-NEXT: v_mov_b32_e32 v0, s0 383; GFX7-NEXT: v_mov_b32_e32 v1, s1 384; GFX7-NEXT: flat_store_dword v[0:1], v2 385; GFX7-NEXT: s_endpgm 386; 387; GFX10-WGP-LABEL: flat_system_unordered_store: 388; GFX10-WGP: ; %bb.0: ; %entry 389; GFX10-WGP-NEXT: s_clause 0x1 390; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 391; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 392; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 393; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 394; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 395; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 396; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 397; GFX10-WGP-NEXT: s_endpgm 398; 399; GFX10-CU-LABEL: flat_system_unordered_store: 400; GFX10-CU: ; %bb.0: ; %entry 401; GFX10-CU-NEXT: s_clause 0x1 402; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 403; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 404; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 405; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 406; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 407; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 408; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 409; GFX10-CU-NEXT: s_endpgm 410; 411; SKIP-CACHE-INV-LABEL: flat_system_unordered_store: 412; SKIP-CACHE-INV: ; %bb.0: ; %entry 413; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 414; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 415; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 416; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 417; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 418; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 419; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 420; SKIP-CACHE-INV-NEXT: s_endpgm 
421; 422; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store: 423; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 424; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 425; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 426; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 427; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 428; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 429; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 430; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 431; 432; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store: 433; GFX90A-TGSPLIT: ; %bb.0: ; %entry 434; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 435; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 436; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 437; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 438; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 439; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 440; GFX90A-TGSPLIT-NEXT: s_endpgm 441 i32 %in, i32* %out) { 442entry: 443 store atomic i32 %in, i32* %out unordered, align 4 444 ret void 445} 446 447define amdgpu_kernel void @flat_system_monotonic_store( 448; GFX7-LABEL: flat_system_monotonic_store: 449; GFX7: ; %bb.0: ; %entry 450; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 451; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 452; GFX7-NEXT: s_waitcnt lgkmcnt(0) 453; GFX7-NEXT: v_mov_b32_e32 v2, s2 454; GFX7-NEXT: v_mov_b32_e32 v0, s0 455; GFX7-NEXT: v_mov_b32_e32 v1, s1 456; GFX7-NEXT: flat_store_dword v[0:1], v2 457; GFX7-NEXT: s_endpgm 458; 459; GFX10-WGP-LABEL: flat_system_monotonic_store: 460; GFX10-WGP: ; %bb.0: ; %entry 461; GFX10-WGP-NEXT: s_clause 0x1 462; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 463; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 464; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 465; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 466; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 467; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 468; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 469; 
GFX10-WGP-NEXT: s_endpgm 470; 471; GFX10-CU-LABEL: flat_system_monotonic_store: 472; GFX10-CU: ; %bb.0: ; %entry 473; GFX10-CU-NEXT: s_clause 0x1 474; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 475; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 476; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 477; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 478; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 479; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 480; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 481; GFX10-CU-NEXT: s_endpgm 482; 483; SKIP-CACHE-INV-LABEL: flat_system_monotonic_store: 484; SKIP-CACHE-INV: ; %bb.0: ; %entry 485; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 486; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 487; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 488; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 489; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 490; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 491; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 492; SKIP-CACHE-INV-NEXT: s_endpgm 493; 494; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store: 495; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 496; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 497; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 498; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 499; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 500; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 501; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 502; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 503; 504; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: 505; GFX90A-TGSPLIT: ; %bb.0: ; %entry 506; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 507; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 508; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 509; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 510; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 511; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 512; GFX90A-TGSPLIT-NEXT: s_endpgm 513 i32 %in, i32* %out) { 514entry: 515 
store atomic i32 %in, i32* %out monotonic, align 4 516 ret void 517} 518 519define amdgpu_kernel void @flat_system_release_store( 520; GFX7-LABEL: flat_system_release_store: 521; GFX7: ; %bb.0: ; %entry 522; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 523; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 524; GFX7-NEXT: s_waitcnt lgkmcnt(0) 525; GFX7-NEXT: v_mov_b32_e32 v2, s2 526; GFX7-NEXT: v_mov_b32_e32 v0, s0 527; GFX7-NEXT: v_mov_b32_e32 v1, s1 528; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 529; GFX7-NEXT: flat_store_dword v[0:1], v2 530; GFX7-NEXT: s_endpgm 531; 532; GFX10-WGP-LABEL: flat_system_release_store: 533; GFX10-WGP: ; %bb.0: ; %entry 534; GFX10-WGP-NEXT: s_clause 0x1 535; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 536; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 537; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 538; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 539; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 540; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 541; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 542; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 543; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 544; GFX10-WGP-NEXT: s_endpgm 545; 546; GFX10-CU-LABEL: flat_system_release_store: 547; GFX10-CU: ; %bb.0: ; %entry 548; GFX10-CU-NEXT: s_clause 0x1 549; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 550; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 551; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 552; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 553; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 554; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 555; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 556; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 557; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 558; GFX10-CU-NEXT: s_endpgm 559; 560; SKIP-CACHE-INV-LABEL: flat_system_release_store: 561; SKIP-CACHE-INV: ; %bb.0: ; %entry 562; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 563; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 564; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 565; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 
s2 566; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 567; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 568; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 569; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 570; SKIP-CACHE-INV-NEXT: s_endpgm 571; 572; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store: 573; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 574; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 575; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 576; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 577; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 578; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 579; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 580; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 581; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 582; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 583; 584; GFX90A-TGSPLIT-LABEL: flat_system_release_store: 585; GFX90A-TGSPLIT: ; %bb.0: ; %entry 586; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 587; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 588; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 589; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 590; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 591; GFX90A-TGSPLIT-NEXT: buffer_wbl2 592; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 593; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 594; GFX90A-TGSPLIT-NEXT: s_endpgm 595 i32 %in, i32* %out) { 596entry: 597 store atomic i32 %in, i32* %out release, align 4 598 ret void 599} 600 601define amdgpu_kernel void @flat_system_seq_cst_store( 602; GFX7-LABEL: flat_system_seq_cst_store: 603; GFX7: ; %bb.0: ; %entry 604; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 605; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 606; GFX7-NEXT: s_waitcnt lgkmcnt(0) 607; GFX7-NEXT: v_mov_b32_e32 v2, s2 608; GFX7-NEXT: v_mov_b32_e32 v0, s0 609; GFX7-NEXT: v_mov_b32_e32 v1, s1 610; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 611; GFX7-NEXT: flat_store_dword v[0:1], v2 612; GFX7-NEXT: 
s_endpgm 613; 614; GFX10-WGP-LABEL: flat_system_seq_cst_store: 615; GFX10-WGP: ; %bb.0: ; %entry 616; GFX10-WGP-NEXT: s_clause 0x1 617; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 618; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 619; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 620; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 621; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 622; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 623; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 624; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 625; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 626; GFX10-WGP-NEXT: s_endpgm 627; 628; GFX10-CU-LABEL: flat_system_seq_cst_store: 629; GFX10-CU: ; %bb.0: ; %entry 630; GFX10-CU-NEXT: s_clause 0x1 631; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 632; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 633; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 634; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 635; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 636; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 637; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 638; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 639; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 640; GFX10-CU-NEXT: s_endpgm 641; 642; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_store: 643; SKIP-CACHE-INV: ; %bb.0: ; %entry 644; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 645; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 646; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 647; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 648; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 649; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 650; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 651; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 652; SKIP-CACHE-INV-NEXT: s_endpgm 653; 654; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: 655; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 656; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 657; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 658; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 659; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s2 660; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 661; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 662; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 663; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 664; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 665; 666; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: 667; GFX90A-TGSPLIT: ; %bb.0: ; %entry 668; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 669; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 670; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 671; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 672; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 673; GFX90A-TGSPLIT-NEXT: buffer_wbl2 674; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 675; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 676; GFX90A-TGSPLIT-NEXT: s_endpgm 677 i32 %in, i32* %out) { 678entry: 679 store atomic i32 %in, i32* %out seq_cst, align 4 680 ret void 681} 682 683define amdgpu_kernel void @flat_system_monotonic_atomicrmw( 684; GFX7-LABEL: flat_system_monotonic_atomicrmw: 685; GFX7: ; %bb.0: ; %entry 686; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 687; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 688; GFX7-NEXT: s_waitcnt lgkmcnt(0) 689; GFX7-NEXT: v_mov_b32_e32 v0, s0 690; GFX7-NEXT: v_mov_b32_e32 v1, s1 691; GFX7-NEXT: v_mov_b32_e32 v2, s2 692; GFX7-NEXT: flat_atomic_swap v[0:1], v2 693; GFX7-NEXT: s_endpgm 694; 695; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw: 696; GFX10-WGP: ; %bb.0: ; %entry 697; GFX10-WGP-NEXT: s_clause 0x1 698; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 699; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 700; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 701; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 702; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 703; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 704; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 705; GFX10-WGP-NEXT: s_endpgm 706; 707; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw: 708; GFX10-CU: ; %bb.0: ; %entry 
709; GFX10-CU-NEXT: s_clause 0x1 710; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 711; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 712; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 713; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 714; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 715; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 716; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 717; GFX10-CU-NEXT: s_endpgm 718; 719; SKIP-CACHE-INV-LABEL: flat_system_monotonic_atomicrmw: 720; SKIP-CACHE-INV: ; %bb.0: ; %entry 721; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 722; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 723; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 724; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 725; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 726; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 727; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 728; SKIP-CACHE-INV-NEXT: s_endpgm 729; 730; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: 731; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 732; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 733; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 734; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 735; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 736; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 737; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 738; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 739; 740; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: 741; GFX90A-TGSPLIT: ; %bb.0: ; %entry 742; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 743; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 744; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 745; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 746; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 747; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 748; GFX90A-TGSPLIT-NEXT: s_endpgm 749 i32* %out, i32 %in) { 750entry: 751 %val = atomicrmw volatile xchg i32* %out, i32 %in monotonic 752 ret void 753} 754 755define 
amdgpu_kernel void @flat_system_acquire_atomicrmw( 756; GFX7-LABEL: flat_system_acquire_atomicrmw: 757; GFX7: ; %bb.0: ; %entry 758; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 759; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 760; GFX7-NEXT: s_waitcnt lgkmcnt(0) 761; GFX7-NEXT: v_mov_b32_e32 v0, s0 762; GFX7-NEXT: v_mov_b32_e32 v1, s1 763; GFX7-NEXT: v_mov_b32_e32 v2, s2 764; GFX7-NEXT: flat_atomic_swap v[0:1], v2 765; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 766; GFX7-NEXT: buffer_wbinvl1_vol 767; GFX7-NEXT: s_endpgm 768; 769; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw: 770; GFX10-WGP: ; %bb.0: ; %entry 771; GFX10-WGP-NEXT: s_clause 0x1 772; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 773; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 774; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 775; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 776; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 777; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 778; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 779; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 780; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 781; GFX10-WGP-NEXT: buffer_gl0_inv 782; GFX10-WGP-NEXT: buffer_gl1_inv 783; GFX10-WGP-NEXT: s_endpgm 784; 785; GFX10-CU-LABEL: flat_system_acquire_atomicrmw: 786; GFX10-CU: ; %bb.0: ; %entry 787; GFX10-CU-NEXT: s_clause 0x1 788; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 789; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 790; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 791; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 792; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 793; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 794; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 795; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 796; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 797; GFX10-CU-NEXT: buffer_gl0_inv 798; GFX10-CU-NEXT: buffer_gl1_inv 799; GFX10-CU-NEXT: s_endpgm 800; 801; SKIP-CACHE-INV-LABEL: flat_system_acquire_atomicrmw: 802; SKIP-CACHE-INV: ; %bb.0: ; %entry 803; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 804; SKIP-CACHE-INV-NEXT: s_load_dword s0, 
s[0:1], 0xb 805; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 806; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 807; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 808; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 809; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 810; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 811; SKIP-CACHE-INV-NEXT: s_endpgm 812; 813; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: 814; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 815; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 816; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 817; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 818; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 819; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 820; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 821; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 822; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 823; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 824; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 825; 826; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: 827; GFX90A-TGSPLIT: ; %bb.0: ; %entry 828; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 829; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 830; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 831; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 832; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 833; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 834; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 835; GFX90A-TGSPLIT-NEXT: buffer_invl2 836; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 837; GFX90A-TGSPLIT-NEXT: s_endpgm 838 i32* %out, i32 %in) { 839entry: 840 %val = atomicrmw volatile xchg i32* %out, i32 %in acquire 841 ret void 842} 843 844define amdgpu_kernel void @flat_system_release_atomicrmw( 845; GFX7-LABEL: flat_system_release_atomicrmw: 846; GFX7: ; %bb.0: ; %entry 847; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 848; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 849; GFX7-NEXT: s_waitcnt lgkmcnt(0) 850; 
GFX7-NEXT: v_mov_b32_e32 v0, s0 851; GFX7-NEXT: v_mov_b32_e32 v1, s1 852; GFX7-NEXT: v_mov_b32_e32 v2, s2 853; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 854; GFX7-NEXT: flat_atomic_swap v[0:1], v2 855; GFX7-NEXT: s_endpgm 856; 857; GFX10-WGP-LABEL: flat_system_release_atomicrmw: 858; GFX10-WGP: ; %bb.0: ; %entry 859; GFX10-WGP-NEXT: s_clause 0x1 860; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 861; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 862; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 863; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 864; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 865; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 866; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 867; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 868; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 869; GFX10-WGP-NEXT: s_endpgm 870; 871; GFX10-CU-LABEL: flat_system_release_atomicrmw: 872; GFX10-CU: ; %bb.0: ; %entry 873; GFX10-CU-NEXT: s_clause 0x1 874; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 875; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 876; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 877; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 878; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 879; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 880; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 881; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 882; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 883; GFX10-CU-NEXT: s_endpgm 884; 885; SKIP-CACHE-INV-LABEL: flat_system_release_atomicrmw: 886; SKIP-CACHE-INV: ; %bb.0: ; %entry 887; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 888; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 889; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 890; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 891; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 892; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 893; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 894; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 895; SKIP-CACHE-INV-NEXT: s_endpgm 896; 897; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: 898; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 899; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 900; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 901; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 902; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 903; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 904; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 905; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 906; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 907; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 908; 909; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: 910; GFX90A-TGSPLIT: ; %bb.0: ; %entry 911; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 912; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 913; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 914; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 915; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 916; GFX90A-TGSPLIT-NEXT: buffer_wbl2 917; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 918; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 919; GFX90A-TGSPLIT-NEXT: s_endpgm 920 i32* %out, i32 %in) { 921entry: 922 %val = atomicrmw volatile xchg i32* %out, i32 %in release 923 ret void 924} 925 926define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( 927; GFX7-LABEL: flat_system_acq_rel_atomicrmw: 928; GFX7: ; %bb.0: ; %entry 929; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 930; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 931; GFX7-NEXT: s_waitcnt lgkmcnt(0) 932; GFX7-NEXT: v_mov_b32_e32 v0, s0 933; GFX7-NEXT: v_mov_b32_e32 v1, s1 934; GFX7-NEXT: v_mov_b32_e32 v2, s2 935; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 936; GFX7-NEXT: flat_atomic_swap v[0:1], v2 937; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 938; GFX7-NEXT: buffer_wbinvl1_vol 939; GFX7-NEXT: s_endpgm 940; 941; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw: 942; GFX10-WGP: ; %bb.0: ; %entry 943; GFX10-WGP-NEXT: s_clause 0x1 944; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 945; 
GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 946; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 947; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 948; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 949; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 950; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 951; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 952; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 953; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 954; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 955; GFX10-WGP-NEXT: buffer_gl0_inv 956; GFX10-WGP-NEXT: buffer_gl1_inv 957; GFX10-WGP-NEXT: s_endpgm 958; 959; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw: 960; GFX10-CU: ; %bb.0: ; %entry 961; GFX10-CU-NEXT: s_clause 0x1 962; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 963; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 964; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 965; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 966; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 967; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 968; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 969; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 970; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 971; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 972; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 973; GFX10-CU-NEXT: buffer_gl0_inv 974; GFX10-CU-NEXT: buffer_gl1_inv 975; GFX10-CU-NEXT: s_endpgm 976; 977; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_atomicrmw: 978; SKIP-CACHE-INV: ; %bb.0: ; %entry 979; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 980; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 981; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 982; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 983; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 984; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 985; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 986; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 987; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 988; SKIP-CACHE-INV-NEXT: s_endpgm 989; 990; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: 991; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 992; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 993; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 994; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 995; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 996; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 997; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 998; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 999; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1000; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1001; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 1002; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 1003; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1004; 1005; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: 1006; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1007; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1008; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1009; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1010; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1011; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1012; GFX90A-TGSPLIT-NEXT: buffer_wbl2 1013; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1014; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1015; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1016; GFX90A-TGSPLIT-NEXT: buffer_invl2 1017; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1018; GFX90A-TGSPLIT-NEXT: s_endpgm 1019 i32* %out, i32 %in) { 1020entry: 1021 %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel 1022 ret void 1023} 1024 1025define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( 1026; GFX7-LABEL: flat_system_seq_cst_atomicrmw: 1027; GFX7: ; %bb.0: ; %entry 1028; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1029; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1030; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1031; GFX7-NEXT: v_mov_b32_e32 v0, s0 1032; GFX7-NEXT: v_mov_b32_e32 v1, s1 1033; GFX7-NEXT: v_mov_b32_e32 v2, s2 1034; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1035; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1036; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1037; GFX7-NEXT: buffer_wbinvl1_vol 1038; GFX7-NEXT: s_endpgm 1039; 1040; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw: 1041; GFX10-WGP: ; %bb.0: ; %entry 1042; GFX10-WGP-NEXT: s_clause 0x1 1043; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1044; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1045; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1046; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1047; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1048; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1049; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1050; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1051; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1052; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1054; GFX10-WGP-NEXT: buffer_gl0_inv 1055; GFX10-WGP-NEXT: buffer_gl1_inv 1056; GFX10-WGP-NEXT: s_endpgm 1057; 1058; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw: 1059; GFX10-CU: ; %bb.0: ; %entry 1060; GFX10-CU-NEXT: s_clause 0x1 1061; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1062; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1063; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1064; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1065; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1066; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1067; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1068; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1069; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1070; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1071; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1072; GFX10-CU-NEXT: buffer_gl0_inv 1073; GFX10-CU-NEXT: buffer_gl1_inv 1074; GFX10-CU-NEXT: s_endpgm 1075; 1076; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_atomicrmw: 1077; SKIP-CACHE-INV: ; %bb.0: ; %entry 1078; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1079; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1080; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1081; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1082; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1083; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1084; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1085; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1086; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1087; SKIP-CACHE-INV-NEXT: s_endpgm 1088; 1089; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: 1090; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1091; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1092; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1093; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1094; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1095; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1096; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 1097; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1098; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1099; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1100; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 1101; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 1102; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1103; 1104; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: 1105; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1106; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1107; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1108; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1109; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1110; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1111; GFX90A-TGSPLIT-NEXT: buffer_wbl2 1112; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1113; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1114; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1115; GFX90A-TGSPLIT-NEXT: buffer_invl2 1116; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1117; GFX90A-TGSPLIT-NEXT: s_endpgm 1118 i32* %out, i32 %in) { 1119entry: 1120 %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst 1121 ret void 1122} 1123 1124define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( 1125; GFX7-LABEL: flat_system_acquire_ret_atomicrmw: 1126; GFX7: ; %bb.0: ; %entry 1127; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 1128; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1129; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1130; GFX7-NEXT: v_mov_b32_e32 v0, s0 1131; GFX7-NEXT: v_mov_b32_e32 v1, s1 1132; GFX7-NEXT: v_mov_b32_e32 v2, s2 1133; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1134; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1135; GFX7-NEXT: buffer_wbinvl1_vol 1136; GFX7-NEXT: flat_store_dword v[0:1], v2 1137; GFX7-NEXT: s_endpgm 1138; 1139; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw: 1140; GFX10-WGP: ; %bb.0: ; %entry 1141; GFX10-WGP-NEXT: s_clause 0x1 1142; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1143; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1144; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1145; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1146; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1147; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1148; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1149; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1150; GFX10-WGP-NEXT: buffer_gl0_inv 1151; GFX10-WGP-NEXT: buffer_gl1_inv 1152; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1153; GFX10-WGP-NEXT: s_endpgm 1154; 1155; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw: 1156; GFX10-CU: ; %bb.0: ; %entry 1157; GFX10-CU-NEXT: s_clause 0x1 1158; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1159; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1160; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1161; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1162; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1163; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1164; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1165; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1166; GFX10-CU-NEXT: buffer_gl0_inv 1167; GFX10-CU-NEXT: buffer_gl1_inv 1168; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1169; GFX10-CU-NEXT: s_endpgm 1170; 1171; SKIP-CACHE-INV-LABEL: flat_system_acquire_ret_atomicrmw: 1172; SKIP-CACHE-INV: ; %bb.0: ; %entry 1173; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1174; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1175; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1176; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1177; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1178; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1179; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1180; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1181; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1182; SKIP-CACHE-INV-NEXT: s_endpgm 1183; 1184; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: 1185; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1186; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1187; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1188; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1189; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1190; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1191; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1192; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1193; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 1194; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 1195; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1196; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1197; 1198; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: 1199; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1200; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1201; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1202; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1204; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1205; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1206; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1207; GFX90A-TGSPLIT-NEXT: buffer_invl2 1208; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1209; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1210; GFX90A-TGSPLIT-NEXT: s_endpgm 1211 i32* %out, i32 %in) { 1212entry: 1213 %val = atomicrmw volatile xchg i32* %out, i32 %in acquire 1214 store i32 %val, i32* %out, align 4 1215 ret void 1216} 1217 1218define 
amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( 1219; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw: 1220; GFX7: ; %bb.0: ; %entry 1221; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1222; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1223; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1224; GFX7-NEXT: v_mov_b32_e32 v0, s0 1225; GFX7-NEXT: v_mov_b32_e32 v1, s1 1226; GFX7-NEXT: v_mov_b32_e32 v2, s2 1227; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1228; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1229; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1230; GFX7-NEXT: buffer_wbinvl1_vol 1231; GFX7-NEXT: flat_store_dword v[0:1], v2 1232; GFX7-NEXT: s_endpgm 1233; 1234; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: 1235; GFX10-WGP: ; %bb.0: ; %entry 1236; GFX10-WGP-NEXT: s_clause 0x1 1237; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1238; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1239; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1240; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1241; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1242; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1243; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1244; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1245; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1246; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1247; GFX10-WGP-NEXT: buffer_gl0_inv 1248; GFX10-WGP-NEXT: buffer_gl1_inv 1249; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1250; GFX10-WGP-NEXT: s_endpgm 1251; 1252; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: 1253; GFX10-CU: ; %bb.0: ; %entry 1254; GFX10-CU-NEXT: s_clause 0x1 1255; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1256; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1257; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1258; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1259; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1260; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1261; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1262; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1263; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1264; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1265; GFX10-CU-NEXT: buffer_gl0_inv 1266; GFX10-CU-NEXT: buffer_gl1_inv 1267; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1268; GFX10-CU-NEXT: s_endpgm 1269; 1270; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_ret_atomicrmw: 1271; SKIP-CACHE-INV: ; %bb.0: ; %entry 1272; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1273; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1274; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1275; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1276; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1277; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1278; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1279; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1280; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1281; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1282; SKIP-CACHE-INV-NEXT: s_endpgm 1283; 1284; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: 1285; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1286; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1287; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1288; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1289; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1290; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1291; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 1292; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1293; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1294; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1295; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 1296; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 1297; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1298; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1299; 1300; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: 1301; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1302; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1303; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1304; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
1305; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1306; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1307; GFX90A-TGSPLIT-NEXT: buffer_wbl2 1308; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1309; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1310; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1311; GFX90A-TGSPLIT-NEXT: buffer_invl2 1312; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1313; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1314; GFX90A-TGSPLIT-NEXT: s_endpgm 1315 i32* %out, i32 %in) { 1316entry: 1317 %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel 1318 store i32 %val, i32* %out, align 4 1319 ret void 1320} 1321 1322define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( 1323; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw: 1324; GFX7: ; %bb.0: ; %entry 1325; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1326; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1327; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1328; GFX7-NEXT: v_mov_b32_e32 v0, s0 1329; GFX7-NEXT: v_mov_b32_e32 v1, s1 1330; GFX7-NEXT: v_mov_b32_e32 v2, s2 1331; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1332; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1333; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1334; GFX7-NEXT: buffer_wbinvl1_vol 1335; GFX7-NEXT: flat_store_dword v[0:1], v2 1336; GFX7-NEXT: s_endpgm 1337; 1338; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: 1339; GFX10-WGP: ; %bb.0: ; %entry 1340; GFX10-WGP-NEXT: s_clause 0x1 1341; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1342; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1343; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1344; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1345; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1346; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1347; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1348; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1349; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1350; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1351; GFX10-WGP-NEXT: buffer_gl0_inv 1352; 
GFX10-WGP-NEXT: buffer_gl1_inv 1353; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1354; GFX10-WGP-NEXT: s_endpgm 1355; 1356; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: 1357; GFX10-CU: ; %bb.0: ; %entry 1358; GFX10-CU-NEXT: s_clause 0x1 1359; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1360; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1361; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1362; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1363; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1364; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1365; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1366; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1367; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1368; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1369; GFX10-CU-NEXT: buffer_gl0_inv 1370; GFX10-CU-NEXT: buffer_gl1_inv 1371; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1372; GFX10-CU-NEXT: s_endpgm 1373; 1374; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_ret_atomicrmw: 1375; SKIP-CACHE-INV: ; %bb.0: ; %entry 1376; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1377; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1378; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1379; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1380; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1381; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1382; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1383; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1384; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1385; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1386; SKIP-CACHE-INV-NEXT: s_endpgm 1387; 1388; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: 1389; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1390; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1391; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1392; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1393; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1394; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1395; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 1396; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1397; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1398; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1399; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 1400; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 1401; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1402; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1403; 1404; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: 1405; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1406; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1407; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1408; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1409; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1410; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1411; GFX90A-TGSPLIT-NEXT: buffer_wbl2 1412; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1413; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1414; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1415; GFX90A-TGSPLIT-NEXT: buffer_invl2 1416; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1417; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1418; GFX90A-TGSPLIT-NEXT: s_endpgm 1419 i32* %out, i32 %in) { 1420entry: 1421 %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst 1422 store i32 %val, i32* %out, align 4 1423 ret void 1424} 1425 1426define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( 1427; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg: 1428; GFX7: ; %bb.0: ; %entry 1429; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1430; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1431; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1432; GFX7-NEXT: s_add_u32 s0, s0, 16 1433; GFX7-NEXT: s_addc_u32 s1, s1, 0 1434; GFX7-NEXT: v_mov_b32_e32 v0, s0 1435; GFX7-NEXT: v_mov_b32_e32 v2, s2 1436; GFX7-NEXT: v_mov_b32_e32 v1, s1 1437; GFX7-NEXT: v_mov_b32_e32 v3, s3 1438; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1439; GFX7-NEXT: s_endpgm 1440; 1441; GFX10-WGP-LABEL: 
flat_system_monotonic_monotonic_cmpxchg: 1442; GFX10-WGP: ; %bb.0: ; %entry 1443; GFX10-WGP-NEXT: s_clause 0x1 1444; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1445; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1446; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1447; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1448; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1449; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1450; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1451; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1452; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1453; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1454; GFX10-WGP-NEXT: s_endpgm 1455; 1456; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: 1457; GFX10-CU: ; %bb.0: ; %entry 1458; GFX10-CU-NEXT: s_clause 0x1 1459; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1460; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1461; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1462; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1463; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1464; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1465; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1466; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1467; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1468; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1469; GFX10-CU-NEXT: s_endpgm 1470; 1471; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_cmpxchg: 1472; SKIP-CACHE-INV: ; %bb.0: ; %entry 1473; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1474; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1475; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1476; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1477; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1478; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1479; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1480; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1481; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1482; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1483; SKIP-CACHE-INV-NEXT: s_endpgm 1484; 1485; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: 1486; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1487; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1488; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1489; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1491; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1492; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1493; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1494; 1495; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: 1496; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1497; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1498; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1499; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1500; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1501; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1502; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1503; GFX90A-TGSPLIT-NEXT: s_endpgm 1504 i32* %out, i32 %in, i32 %old) { 1505entry: 1506 %gep = getelementptr i32, i32* %out, i32 4 1507 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic 1508 ret void 1509} 1510 1511define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( 1512; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg: 1513; GFX7: ; %bb.0: ; %entry 1514; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1515; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1516; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1517; GFX7-NEXT: s_add_u32 s0, s0, 16 1518; GFX7-NEXT: s_addc_u32 s1, s1, 0 1519; GFX7-NEXT: v_mov_b32_e32 v0, s0 1520; GFX7-NEXT: v_mov_b32_e32 v2, s2 1521; GFX7-NEXT: v_mov_b32_e32 v1, s1 1522; GFX7-NEXT: v_mov_b32_e32 v3, s3 1523; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1524; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1525; GFX7-NEXT: buffer_wbinvl1_vol 1526; GFX7-NEXT: s_endpgm 1527; 1528; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: 1529; 
GFX10-WGP: ; %bb.0: ; %entry 1530; GFX10-WGP-NEXT: s_clause 0x1 1531; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1532; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1533; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1534; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1535; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1536; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1537; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1538; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1539; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1540; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1541; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1542; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1543; GFX10-WGP-NEXT: buffer_gl0_inv 1544; GFX10-WGP-NEXT: buffer_gl1_inv 1545; GFX10-WGP-NEXT: s_endpgm 1546; 1547; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: 1548; GFX10-CU: ; %bb.0: ; %entry 1549; GFX10-CU-NEXT: s_clause 0x1 1550; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1551; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1552; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1553; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1554; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1555; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1556; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1557; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1558; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1559; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1560; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1561; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1562; GFX10-CU-NEXT: buffer_gl0_inv 1563; GFX10-CU-NEXT: buffer_gl1_inv 1564; GFX10-CU-NEXT: s_endpgm 1565; 1566; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_cmpxchg: 1567; SKIP-CACHE-INV: ; %bb.0: ; %entry 1568; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1569; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1570; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1571; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1572; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1573; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1574; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic
  ret void
}

; Flat cmpxchg with success ordering release and failure ordering monotonic.
; The kernel steps %out forward by 4 i32 elements (16 bytes) and issues a
; volatile cmpxchg with no syncscope; the result is not used.
; Generated expectations: s_waitcnt vmcnt(0) lgkmcnt(0) is emitted ahead of
; flat_atomic_cmpswap (followed by s_waitcnt_vscnt on gfx1010, preceded by
; buffer_wbl2 on gfx90a), and only s_endpgm follows the atomic, i.e. there is
; no wait or cache invalidate after it.
define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ;
%entry 1616; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1617; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1618; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1619; GFX7-NEXT: s_add_u32 s0, s0, 16 1620; GFX7-NEXT: s_addc_u32 s1, s1, 0 1621; GFX7-NEXT: v_mov_b32_e32 v0, s0 1622; GFX7-NEXT: v_mov_b32_e32 v2, s2 1623; GFX7-NEXT: v_mov_b32_e32 v1, s1 1624; GFX7-NEXT: v_mov_b32_e32 v3, s3 1625; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1626; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1627; GFX7-NEXT: s_endpgm 1628; 1629; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg: 1630; GFX10-WGP: ; %bb.0: ; %entry 1631; GFX10-WGP-NEXT: s_clause 0x1 1632; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1633; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1634; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1635; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1636; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1637; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1638; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1639; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1640; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1641; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1642; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1643; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1644; GFX10-WGP-NEXT: s_endpgm 1645; 1646; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg: 1647; GFX10-CU: ; %bb.0: ; %entry 1648; GFX10-CU-NEXT: s_clause 0x1 1649; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1650; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1651; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1652; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1653; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1654; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1655; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1656; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1657; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1658; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1659; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1660; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1661; GFX10-CU-NEXT: s_endpgm 1662; 1663; 
SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_cmpxchg: 1664; SKIP-CACHE-INV: ; %bb.0: ; %entry 1665; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1666; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1667; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1668; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1669; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1670; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1671; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1672; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1673; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1674; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1675; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1676; SKIP-CACHE-INV-NEXT: s_endpgm 1677; 1678; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: 1679; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1680; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1681; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1682; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1683; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1684; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1685; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 1686; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1687; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1688; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1689; 1690; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: 1691; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1692; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1693; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1694; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1695; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1696; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1697; GFX90A-TGSPLIT-NEXT: buffer_wbl2 1698; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1699; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic
  ret void
}

; Flat cmpxchg with success ordering acq_rel and failure ordering monotonic.
; The kernel steps %out forward by 4 i32 elements (16 bytes) and issues a
; volatile cmpxchg with no syncscope; the result is not used.
; Generated expectations: a wait is emitted on both sides of
; flat_atomic_cmpswap. Before it comes s_waitcnt vmcnt(0) lgkmcnt(0) (plus
; s_waitcnt_vscnt on gfx1010, and buffer_wbl2 first on gfx90a); after it comes
; a second wait and the cache invalidate of the target. The gfx700 amdpal
; invocation that passes -amdgcn-skip-cache-invalidations keeps both waits but
; has no invalidate instruction.
define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT:
s_clause 0x1 1750; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1751; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1752; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1753; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1754; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1755; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1756; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1757; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1758; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1759; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1760; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1761; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1762; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1763; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1764; GFX10-CU-NEXT: buffer_gl0_inv 1765; GFX10-CU-NEXT: buffer_gl1_inv 1766; GFX10-CU-NEXT: s_endpgm 1767; 1768; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_cmpxchg: 1769; SKIP-CACHE-INV: ; %bb.0: ; %entry 1770; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1771; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1772; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1773; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1774; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1775; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1776; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1777; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1778; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1779; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1780; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1781; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1782; SKIP-CACHE-INV-NEXT: s_endpgm 1783; 1784; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: 1785; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1786; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1787; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1788; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1789; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1790; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic
  ret void
}

; Flat cmpxchg with success ordering seq_cst and failure ordering monotonic.
; The kernel steps %out forward by 4 i32 elements (16 bytes) and issues a
; volatile cmpxchg with no syncscope; the result is not used.
; Generated expectations: a wait is emitted on both sides of
; flat_atomic_cmpswap. Before it comes s_waitcnt vmcnt(0) lgkmcnt(0) (plus
; s_waitcnt_vscnt on gfx1010, and buffer_wbl2 first on gfx90a); after it comes
; a second wait and the cache invalidate of the target. The gfx700 amdpal
; invocation that passes -amdgcn-skip-cache-invalidations keeps both waits but
; has no invalidate instruction.
define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT:
s_waitcnt vmcnt(0) lgkmcnt(0) 1835; GFX7-NEXT: buffer_wbinvl1_vol 1836; GFX7-NEXT: s_endpgm 1837; 1838; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: 1839; GFX10-WGP: ; %bb.0: ; %entry 1840; GFX10-WGP-NEXT: s_clause 0x1 1841; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1842; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1843; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1844; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1845; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1846; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1847; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1848; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1849; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1850; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1851; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1852; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1853; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1854; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1855; GFX10-WGP-NEXT: buffer_gl0_inv 1856; GFX10-WGP-NEXT: buffer_gl1_inv 1857; GFX10-WGP-NEXT: s_endpgm 1858; 1859; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: 1860; GFX10-CU: ; %bb.0: ; %entry 1861; GFX10-CU-NEXT: s_clause 0x1 1862; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1863; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1864; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1865; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1866; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1867; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1868; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1869; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1870; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1871; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1872; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1873; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1874; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1875; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1876; GFX10-CU-NEXT: buffer_gl0_inv 1877; GFX10-CU-NEXT: buffer_gl1_inv 1878; GFX10-CU-NEXT: s_endpgm 1879; 1880; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_cmpxchg: 1881; SKIP-CACHE-INV: ; %bb.0: ; %entry 
1882; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1883; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1884; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1885; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1886; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1887; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1888; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1889; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1890; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1891; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1892; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1893; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1894; SKIP-CACHE-INV-NEXT: s_endpgm 1895; 1896; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: 1897; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1898; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1899; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1900; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1901; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1902; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1903; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 1904; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1905; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1906; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1907; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 1908; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 1909; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1910; 1911; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: 1912; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1913; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1914; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1915; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1916; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1917; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1918; GFX90A-TGSPLIT-NEXT: buffer_wbl2 1919; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic
  ret void
}

; Flat cmpxchg with success ordering monotonic and failure ordering acquire.
; The kernel steps %out forward by 4 i32 elements (16 bytes) and issues a
; volatile cmpxchg with no syncscope; the result is not used.
; Generated expectations: flat_atomic_cmpswap is not directly preceded by an
; s_waitcnt, and is followed by a wait plus the cache invalidate of the target
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv and buffer_gl1_inv on gfx1010,
; buffer_invl2 and buffer_wbinvl1_vol on gfx90a). The gfx700 amdpal invocation
; that passes -amdgcn-skip-cache-invalidations keeps the wait but has no
; invalidate instruction.
define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
;
GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: 1969; GFX10-CU: ; %bb.0: ; %entry 1970; GFX10-CU-NEXT: s_clause 0x1 1971; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1972; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1973; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1974; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1975; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1976; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1977; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1978; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1979; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1980; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1981; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1982; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1983; GFX10-CU-NEXT: buffer_gl0_inv 1984; GFX10-CU-NEXT: buffer_gl1_inv 1985; GFX10-CU-NEXT: s_endpgm 1986; 1987; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg: 1988; SKIP-CACHE-INV: ; %bb.0: ; %entry 1989; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1990; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1991; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1992; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1993; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1994; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1995; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1996; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1997; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1998; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1999; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2000; SKIP-CACHE-INV-NEXT: s_endpgm 2001; 2002; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: 2003; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2004; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2005; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2006; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2007; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2008; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2009; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic acquire
  ret void
}

; Flat cmpxchg with success ordering acquire and failure ordering acquire.
; The kernel steps %out forward by 4 i32 elements (16 bytes) and issues a
; volatile cmpxchg with no syncscope; the result is not used.
; Generated expectations: flat_atomic_cmpswap is not directly preceded by an
; s_waitcnt, and is followed by a wait plus the cache invalidate of the target
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv and buffer_gl1_inv on gfx1010,
; buffer_invl2 and buffer_wbinvl1_vol on gfx90a). The gfx700 amdpal invocation
; that passes -amdgcn-skip-cache-invalidations keeps the wait but has no
; invalidate instruction.
define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
;
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2056; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2057; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2058; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2059; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2060; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2061; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2062; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2063; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2064; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2065; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2066; GFX10-WGP-NEXT: buffer_gl0_inv 2067; GFX10-WGP-NEXT: buffer_gl1_inv 2068; GFX10-WGP-NEXT: s_endpgm 2069; 2070; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg: 2071; GFX10-CU: ; %bb.0: ; %entry 2072; GFX10-CU-NEXT: s_clause 0x1 2073; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2074; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2075; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2076; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2077; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2078; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2079; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2080; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2081; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2082; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2083; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2084; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2085; GFX10-CU-NEXT: buffer_gl0_inv 2086; GFX10-CU-NEXT: buffer_gl1_inv 2087; GFX10-CU-NEXT: s_endpgm 2088; 2089; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_cmpxchg: 2090; SKIP-CACHE-INV: ; %bb.0: ; %entry 2091; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2092; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2093; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2094; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2095; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2096; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2097; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2098; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2099; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2100; SKIP-CACHE-INV-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire
  ret void
}

; Flat cmpxchg with success ordering release and failure ordering acquire.
; The kernel steps %out forward by 4 i32 elements (16 bytes) and issues a
; volatile cmpxchg with no syncscope; the result is not used.
; Generated expectations: a wait is emitted on both sides of
; flat_atomic_cmpswap. Before it comes s_waitcnt vmcnt(0) lgkmcnt(0) (plus
; s_waitcnt_vscnt on gfx1010, and buffer_wbl2 first on gfx90a); after it comes
; a second wait and the cache invalidate of the target. The gfx700 amdpal
; invocation that passes -amdgcn-skip-cache-invalidations keeps both waits but
; has no invalidate instruction.
define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:
s_waitcnt lgkmcnt(0) 2142; GFX7-NEXT: s_add_u32 s0, s0, 16 2143; GFX7-NEXT: s_addc_u32 s1, s1, 0 2144; GFX7-NEXT: v_mov_b32_e32 v0, s0 2145; GFX7-NEXT: v_mov_b32_e32 v2, s2 2146; GFX7-NEXT: v_mov_b32_e32 v1, s1 2147; GFX7-NEXT: v_mov_b32_e32 v3, s3 2148; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2149; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2150; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2151; GFX7-NEXT: buffer_wbinvl1_vol 2152; GFX7-NEXT: s_endpgm 2153; 2154; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg: 2155; GFX10-WGP: ; %bb.0: ; %entry 2156; GFX10-WGP-NEXT: s_clause 0x1 2157; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2158; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2159; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2160; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2161; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2162; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2163; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2164; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2165; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2166; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2167; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2168; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2169; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2170; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2171; GFX10-WGP-NEXT: buffer_gl0_inv 2172; GFX10-WGP-NEXT: buffer_gl1_inv 2173; GFX10-WGP-NEXT: s_endpgm 2174; 2175; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg: 2176; GFX10-CU: ; %bb.0: ; %entry 2177; GFX10-CU-NEXT: s_clause 0x1 2178; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2179; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2180; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2181; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2182; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2183; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2184; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2185; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2186; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2187; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2188; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 
2189; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2190; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2191; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2192; GFX10-CU-NEXT: buffer_gl0_inv 2193; GFX10-CU-NEXT: buffer_gl1_inv 2194; GFX10-CU-NEXT: s_endpgm 2195; 2196; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_cmpxchg: 2197; SKIP-CACHE-INV: ; %bb.0: ; %entry 2198; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2199; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2200; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2201; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2202; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2203; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2204; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2205; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2206; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2207; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2208; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2209; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2210; SKIP-CACHE-INV-NEXT: s_endpgm 2211; 2212; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: 2213; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2214; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2215; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2216; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2217; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2218; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2219; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 2220; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2221; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2222; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2223; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 2224; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 2225; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2226; 2227; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: 2228; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2229; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire
  ret void
}

; Flat cmpxchg with success ordering acq_rel and failure ordering acquire.
; The kernel steps %out forward by 4 i32 elements (16 bytes) and issues a
; volatile cmpxchg with no syncscope; the result is not used.
; Generated expectations: a wait is emitted on both sides of
; flat_atomic_cmpswap. Before it comes s_waitcnt vmcnt(0) lgkmcnt(0) (plus
; s_waitcnt_vscnt on gfx1010, and buffer_wbl2 first on gfx90a); after it comes
; a second wait and the cache invalidate of the target. The gfx700 amdpal
; invocation that passes -amdgcn-skip-cache-invalidations keeps both waits but
; has no invalidate instruction.
define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 16
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
;
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2276; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2277; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2278; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2279; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2280; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2281; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2282; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2283; GFX10-WGP-NEXT: buffer_gl0_inv 2284; GFX10-WGP-NEXT: buffer_gl1_inv 2285; GFX10-WGP-NEXT: s_endpgm 2286; 2287; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: 2288; GFX10-CU: ; %bb.0: ; %entry 2289; GFX10-CU-NEXT: s_clause 0x1 2290; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2291; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2292; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2293; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2294; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2295; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2296; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2297; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2298; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2299; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2300; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2301; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2302; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2303; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2304; GFX10-CU-NEXT: buffer_gl0_inv 2305; GFX10-CU-NEXT: buffer_gl1_inv 2306; GFX10-CU-NEXT: s_endpgm 2307; 2308; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_cmpxchg: 2309; SKIP-CACHE-INV: ; %bb.0: ; %entry 2310; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2311; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2312; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2313; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2314; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2315; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2316; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2317; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2318; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2319; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2320; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2321; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2322; SKIP-CACHE-INV-NEXT: s_endpgm 2323; 2324; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: 2325; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2326; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2327; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2328; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2329; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2330; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2331; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 2332; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2333; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2334; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2335; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 2336; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 2337; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2338; 2339; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: 2340; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2341; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2342; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2343; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2344; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2345; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2346; GFX90A-TGSPLIT-NEXT: buffer_wbl2 2347; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2348; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2349; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2350; GFX90A-TGSPLIT-NEXT: buffer_invl2 2351; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2352; GFX90A-TGSPLIT-NEXT: s_endpgm 2353 i32* %out, i32 %in, i32 %old) { 2354entry: 2355 %gep = getelementptr i32, i32* %out, i32 4 2356 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire 2357 ret void 2358} 2359 2360define amdgpu_kernel void 
@flat_system_seq_cst_acquire_cmpxchg( 2361; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg: 2362; GFX7: ; %bb.0: ; %entry 2363; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2364; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2365; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2366; GFX7-NEXT: s_add_u32 s0, s0, 16 2367; GFX7-NEXT: s_addc_u32 s1, s1, 0 2368; GFX7-NEXT: v_mov_b32_e32 v0, s0 2369; GFX7-NEXT: v_mov_b32_e32 v2, s2 2370; GFX7-NEXT: v_mov_b32_e32 v1, s1 2371; GFX7-NEXT: v_mov_b32_e32 v3, s3 2372; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2373; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2374; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2375; GFX7-NEXT: buffer_wbinvl1_vol 2376; GFX7-NEXT: s_endpgm 2377; 2378; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: 2379; GFX10-WGP: ; %bb.0: ; %entry 2380; GFX10-WGP-NEXT: s_clause 0x1 2381; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2382; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2383; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2384; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2385; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2386; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2387; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2388; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2389; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2390; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2391; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2392; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2393; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2394; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2395; GFX10-WGP-NEXT: buffer_gl0_inv 2396; GFX10-WGP-NEXT: buffer_gl1_inv 2397; GFX10-WGP-NEXT: s_endpgm 2398; 2399; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: 2400; GFX10-CU: ; %bb.0: ; %entry 2401; GFX10-CU-NEXT: s_clause 0x1 2402; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2403; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2404; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2405; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2406; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2407; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 2408; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2409; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2410; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2411; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2412; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2413; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2414; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2415; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2416; GFX10-CU-NEXT: buffer_gl0_inv 2417; GFX10-CU-NEXT: buffer_gl1_inv 2418; GFX10-CU-NEXT: s_endpgm 2419; 2420; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_cmpxchg: 2421; SKIP-CACHE-INV: ; %bb.0: ; %entry 2422; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2423; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2424; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2425; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2426; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2427; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2428; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2429; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2430; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2431; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2432; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2433; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2434; SKIP-CACHE-INV-NEXT: s_endpgm 2435; 2436; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: 2437; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2438; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2439; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2440; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2441; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2442; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2443; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 2444; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2445; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2446; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2447; GFX90A-NOTTGSPLIT-NEXT: 
buffer_invl2 2448; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 2449; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2450; 2451; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: 2452; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2453; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2454; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2455; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2456; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2457; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2458; GFX90A-TGSPLIT-NEXT: buffer_wbl2 2459; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2460; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2461; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2462; GFX90A-TGSPLIT-NEXT: buffer_invl2 2463; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2464; GFX90A-TGSPLIT-NEXT: s_endpgm 2465 i32* %out, i32 %in, i32 %old) { 2466entry: 2467 %gep = getelementptr i32, i32* %out, i32 4 2468 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire 2469 ret void 2470} 2471 2472define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( 2473; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg: 2474; GFX7: ; %bb.0: ; %entry 2475; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2476; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2477; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2478; GFX7-NEXT: s_add_u32 s0, s0, 16 2479; GFX7-NEXT: s_addc_u32 s1, s1, 0 2480; GFX7-NEXT: v_mov_b32_e32 v0, s0 2481; GFX7-NEXT: v_mov_b32_e32 v2, s2 2482; GFX7-NEXT: v_mov_b32_e32 v1, s1 2483; GFX7-NEXT: v_mov_b32_e32 v3, s3 2484; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2485; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2486; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2487; GFX7-NEXT: buffer_wbinvl1_vol 2488; GFX7-NEXT: s_endpgm 2489; 2490; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: 2491; GFX10-WGP: ; %bb.0: ; %entry 2492; GFX10-WGP-NEXT: s_clause 0x1 2493; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
2494; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2495; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2496; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2497; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2498; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2499; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2500; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2501; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2502; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2503; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2504; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2505; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2506; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2507; GFX10-WGP-NEXT: buffer_gl0_inv 2508; GFX10-WGP-NEXT: buffer_gl1_inv 2509; GFX10-WGP-NEXT: s_endpgm 2510; 2511; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: 2512; GFX10-CU: ; %bb.0: ; %entry 2513; GFX10-CU-NEXT: s_clause 0x1 2514; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2515; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2516; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2517; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2518; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2519; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2520; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2521; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2522; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2523; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2524; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2525; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2526; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2527; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2528; GFX10-CU-NEXT: buffer_gl0_inv 2529; GFX10-CU-NEXT: buffer_gl1_inv 2530; GFX10-CU-NEXT: s_endpgm 2531; 2532; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg: 2533; SKIP-CACHE-INV: ; %bb.0: ; %entry 2534; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2535; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2536; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2537; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2538; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2539; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2540; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2541; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2542; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2543; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2544; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2545; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2546; SKIP-CACHE-INV-NEXT: s_endpgm 2547; 2548; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: 2549; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2550; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2551; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2552; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2553; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2554; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2555; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 2556; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2557; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2558; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2559; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 2560; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 2561; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2562; 2563; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: 2564; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2565; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2566; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2567; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2568; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2569; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2570; GFX90A-TGSPLIT-NEXT: buffer_wbl2 2571; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2572; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2573; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2574; GFX90A-TGSPLIT-NEXT: buffer_invl2 2575; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2576; GFX90A-TGSPLIT-NEXT: s_endpgm 
2577 i32* %out, i32 %in, i32 %old) { 2578entry: 2579 %gep = getelementptr i32, i32* %out, i32 4 2580 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic seq_cst 2581 ret void 2582} 2583 2584define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( 2585; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg: 2586; GFX7: ; %bb.0: ; %entry 2587; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2588; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2589; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2590; GFX7-NEXT: s_add_u32 s0, s0, 16 2591; GFX7-NEXT: s_addc_u32 s1, s1, 0 2592; GFX7-NEXT: v_mov_b32_e32 v0, s0 2593; GFX7-NEXT: v_mov_b32_e32 v2, s2 2594; GFX7-NEXT: v_mov_b32_e32 v1, s1 2595; GFX7-NEXT: v_mov_b32_e32 v3, s3 2596; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2597; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2598; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2599; GFX7-NEXT: buffer_wbinvl1_vol 2600; GFX7-NEXT: s_endpgm 2601; 2602; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: 2603; GFX10-WGP: ; %bb.0: ; %entry 2604; GFX10-WGP-NEXT: s_clause 0x1 2605; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2606; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2607; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2608; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2609; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2610; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2611; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2612; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2613; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2614; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2615; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2616; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2617; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2618; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2619; GFX10-WGP-NEXT: buffer_gl0_inv 2620; GFX10-WGP-NEXT: buffer_gl1_inv 2621; GFX10-WGP-NEXT: s_endpgm 2622; 2623; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: 2624; GFX10-CU: ; %bb.0: ; %entry 2625; GFX10-CU-NEXT: s_clause 0x1 2626; GFX10-CU-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 2627; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2628; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2629; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2630; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2631; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2632; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2633; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2634; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2635; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2636; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2637; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2638; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2639; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2640; GFX10-CU-NEXT: buffer_gl0_inv 2641; GFX10-CU-NEXT: buffer_gl1_inv 2642; GFX10-CU-NEXT: s_endpgm 2643; 2644; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg: 2645; SKIP-CACHE-INV: ; %bb.0: ; %entry 2646; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2647; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2648; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2649; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2650; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2651; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2652; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2653; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2654; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2655; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2656; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2657; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2658; SKIP-CACHE-INV-NEXT: s_endpgm 2659; 2660; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: 2661; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2662; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2663; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2664; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2665; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2666; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2667; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbl2 2668; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2669; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2670; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2671; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 2672; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 2673; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2674; 2675; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: 2676; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2677; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2678; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2679; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2680; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2681; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2682; GFX90A-TGSPLIT-NEXT: buffer_wbl2 2683; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2684; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2685; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2686; GFX90A-TGSPLIT-NEXT: buffer_invl2 2687; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2688; GFX90A-TGSPLIT-NEXT: s_endpgm 2689 i32* %out, i32 %in, i32 %old) { 2690entry: 2691 %gep = getelementptr i32, i32* %out, i32 4 2692 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire seq_cst 2693 ret void 2694} 2695 2696define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( 2697; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg: 2698; GFX7: ; %bb.0: ; %entry 2699; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2700; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2701; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2702; GFX7-NEXT: s_add_u32 s0, s0, 16 2703; GFX7-NEXT: s_addc_u32 s1, s1, 0 2704; GFX7-NEXT: v_mov_b32_e32 v0, s0 2705; GFX7-NEXT: v_mov_b32_e32 v2, s2 2706; GFX7-NEXT: v_mov_b32_e32 v1, s1 2707; GFX7-NEXT: v_mov_b32_e32 v3, s3 2708; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2709; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2710; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2711; GFX7-NEXT: 
buffer_wbinvl1_vol 2712; GFX7-NEXT: s_endpgm 2713; 2714; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: 2715; GFX10-WGP: ; %bb.0: ; %entry 2716; GFX10-WGP-NEXT: s_clause 0x1 2717; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2718; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2719; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2720; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2721; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2722; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2723; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2724; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2725; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2726; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2727; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2728; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2729; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2730; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2731; GFX10-WGP-NEXT: buffer_gl0_inv 2732; GFX10-WGP-NEXT: buffer_gl1_inv 2733; GFX10-WGP-NEXT: s_endpgm 2734; 2735; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg: 2736; GFX10-CU: ; %bb.0: ; %entry 2737; GFX10-CU-NEXT: s_clause 0x1 2738; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2739; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2740; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2741; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2742; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2743; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2744; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2745; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2746; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2747; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2748; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2749; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2750; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2751; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2752; GFX10-CU-NEXT: buffer_gl0_inv 2753; GFX10-CU-NEXT: buffer_gl1_inv 2754; GFX10-CU-NEXT: s_endpgm 2755; 2756; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg: 2757; SKIP-CACHE-INV: ; %bb.0: ; %entry 2758; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x9 2759; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2760; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2761; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2762; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2763; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2764; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2765; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2766; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2767; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2768; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2769; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2770; SKIP-CACHE-INV-NEXT: s_endpgm 2771; 2772; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: 2773; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2774; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2775; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2776; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2777; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2778; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2779; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 2780; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2781; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2782; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2783; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 2784; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 2785; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2786; 2787; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: 2788; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2789; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2790; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2791; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2792; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2793; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2794; GFX90A-TGSPLIT-NEXT: buffer_wbl2 2795; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2796; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2797; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2798; GFX90A-TGSPLIT-NEXT: buffer_invl2 2799; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2800; GFX90A-TGSPLIT-NEXT: s_endpgm 2801 i32* %out, i32 %in, i32 %old) { 2802entry: 2803 %gep = getelementptr i32, i32* %out, i32 4 2804 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release seq_cst 2805 ret void 2806} 2807 2808define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( 2809; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: 2810; GFX7: ; %bb.0: ; %entry 2811; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2812; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2813; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2814; GFX7-NEXT: s_add_u32 s0, s0, 16 2815; GFX7-NEXT: s_addc_u32 s1, s1, 0 2816; GFX7-NEXT: v_mov_b32_e32 v0, s0 2817; GFX7-NEXT: v_mov_b32_e32 v2, s2 2818; GFX7-NEXT: v_mov_b32_e32 v1, s1 2819; GFX7-NEXT: v_mov_b32_e32 v3, s3 2820; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2821; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2822; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2823; GFX7-NEXT: buffer_wbinvl1_vol 2824; GFX7-NEXT: s_endpgm 2825; 2826; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: 2827; GFX10-WGP: ; %bb.0: ; %entry 2828; GFX10-WGP-NEXT: s_clause 0x1 2829; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2830; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2831; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2832; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2833; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2834; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2835; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2836; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2837; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2838; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2839; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2840; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2841; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2842; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2843; GFX10-WGP-NEXT: buffer_gl0_inv 
2844; GFX10-WGP-NEXT: buffer_gl1_inv 2845; GFX10-WGP-NEXT: s_endpgm 2846; 2847; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: 2848; GFX10-CU: ; %bb.0: ; %entry 2849; GFX10-CU-NEXT: s_clause 0x1 2850; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2851; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2852; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2853; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2854; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2855; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2856; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2857; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2858; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2859; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2860; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2861; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2862; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2863; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2864; GFX10-CU-NEXT: buffer_gl0_inv 2865; GFX10-CU-NEXT: buffer_gl1_inv 2866; GFX10-CU-NEXT: s_endpgm 2867; 2868; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: 2869; SKIP-CACHE-INV: ; %bb.0: ; %entry 2870; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2871; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2872; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2873; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2874; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2875; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2876; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2877; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2878; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2879; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2880; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2881; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2882; SKIP-CACHE-INV-NEXT: s_endpgm 2883; 2884; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: 2885; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2886; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2887; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2888; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2889; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2890; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2891; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 2892; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2893; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2894; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2895; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 2896; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 2897; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2898; 2899; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: 2900; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2901; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2902; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2903; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2904; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2905; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2906; GFX90A-TGSPLIT-NEXT: buffer_wbl2 2907; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2908; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2909; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2910; GFX90A-TGSPLIT-NEXT: buffer_invl2 2911; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2912; GFX90A-TGSPLIT-NEXT: s_endpgm 2913 i32* %out, i32 %in, i32 %old) { 2914entry: 2915 %gep = getelementptr i32, i32* %out, i32 4 2916 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel seq_cst 2917 ret void 2918} 2919 2920define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( 2921; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: 2922; GFX7: ; %bb.0: ; %entry 2923; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2924; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2925; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2926; GFX7-NEXT: s_add_u32 s0, s0, 16 2927; GFX7-NEXT: s_addc_u32 s1, s1, 0 2928; GFX7-NEXT: v_mov_b32_e32 v0, s0 2929; GFX7-NEXT: v_mov_b32_e32 v2, s2 2930; 
GFX7-NEXT: v_mov_b32_e32 v1, s1 2931; GFX7-NEXT: v_mov_b32_e32 v3, s3 2932; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2933; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2934; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2935; GFX7-NEXT: buffer_wbinvl1_vol 2936; GFX7-NEXT: s_endpgm 2937; 2938; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: 2939; GFX10-WGP: ; %bb.0: ; %entry 2940; GFX10-WGP-NEXT: s_clause 0x1 2941; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2942; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2943; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2944; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2945; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2946; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2947; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2948; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2949; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2950; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2951; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2952; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2953; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2954; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2955; GFX10-WGP-NEXT: buffer_gl0_inv 2956; GFX10-WGP-NEXT: buffer_gl1_inv 2957; GFX10-WGP-NEXT: s_endpgm 2958; 2959; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: 2960; GFX10-CU: ; %bb.0: ; %entry 2961; GFX10-CU-NEXT: s_clause 0x1 2962; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2963; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2964; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2965; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2966; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2967; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2968; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2969; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2970; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2971; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2972; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2973; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2974; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2975; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2976; GFX10-CU-NEXT: buffer_gl0_inv 
2977; GFX10-CU-NEXT: buffer_gl1_inv 2978; GFX10-CU-NEXT: s_endpgm 2979; 2980; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: 2981; SKIP-CACHE-INV: ; %bb.0: ; %entry 2982; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2983; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2984; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2985; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2986; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2987; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2988; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2989; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2990; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2991; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2992; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2993; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2994; SKIP-CACHE-INV-NEXT: s_endpgm 2995; 2996; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: 2997; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2998; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2999; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3000; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3001; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3002; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3003; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 3004; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3005; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3006; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3007; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 3008; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 3009; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3010; 3011; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: 3012; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3013; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3014; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3015; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3016; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3017; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3018; GFX90A-TGSPLIT-NEXT: buffer_wbl2 3019; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3020; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3021; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3022; GFX90A-TGSPLIT-NEXT: buffer_invl2 3023; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3024; GFX90A-TGSPLIT-NEXT: s_endpgm 3025 i32* %out, i32 %in, i32 %old) { 3026entry: 3027 %gep = getelementptr i32, i32* %out, i32 4 3028 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst 3029 ret void 3030} 3031 3032define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( 3033; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: 3034; GFX7: ; %bb.0: ; %entry 3035; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3036; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3037; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3038; GFX7-NEXT: s_add_u32 s4, s0, 16 3039; GFX7-NEXT: s_addc_u32 s5, s1, 0 3040; GFX7-NEXT: v_mov_b32_e32 v0, s4 3041; GFX7-NEXT: v_mov_b32_e32 v2, s2 3042; GFX7-NEXT: v_mov_b32_e32 v1, s5 3043; GFX7-NEXT: v_mov_b32_e32 v3, s3 3044; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3045; GFX7-NEXT: v_mov_b32_e32 v0, s0 3046; GFX7-NEXT: v_mov_b32_e32 v1, s1 3047; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3048; GFX7-NEXT: flat_store_dword v[0:1], v2 3049; GFX7-NEXT: s_endpgm 3050; 3051; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: 3052; GFX10-WGP: ; %bb.0: ; %entry 3053; GFX10-WGP-NEXT: s_clause 0x1 3054; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3055; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3056; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3057; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3058; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3059; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3060; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3061; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3062; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 
3063; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3064; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3065; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3066; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3067; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3068; GFX10-WGP-NEXT: s_endpgm 3069; 3070; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: 3071; GFX10-CU: ; %bb.0: ; %entry 3072; GFX10-CU-NEXT: s_clause 0x1 3073; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3074; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3075; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3076; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3077; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3078; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3079; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3080; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3081; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3082; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3083; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3084; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3085; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3086; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3087; GFX10-CU-NEXT: s_endpgm 3088; 3089; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: 3090; SKIP-CACHE-INV: ; %bb.0: ; %entry 3091; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3092; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3093; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3094; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3095; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3096; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3097; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3098; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3099; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3100; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3101; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3102; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3103; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3104; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3105; SKIP-CACHE-INV-NEXT: 
s_endpgm 3106; 3107; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: 3108; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3109; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3110; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3111; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3112; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3113; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3114; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3115; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3116; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3117; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3118; 3119; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: 3120; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3121; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3122; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3123; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3124; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3125; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3126; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3127; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3128; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3129; GFX90A-TGSPLIT-NEXT: s_endpgm 3130 i32* %out, i32 %in, i32 %old) { 3131entry: 3132 %gep = getelementptr i32, i32* %out, i32 4 3133 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic 3134 %val0 = extractvalue { i32, i1 } %val, 0 3135 store i32 %val0, i32* %out, align 4 3136 ret void 3137} 3138 3139define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( 3140; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: 3141; GFX7: ; %bb.0: ; %entry 3142; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3143; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3144; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3145; GFX7-NEXT: s_add_u32 s4, 
s0, 16 3146; GFX7-NEXT: s_addc_u32 s5, s1, 0 3147; GFX7-NEXT: v_mov_b32_e32 v0, s4 3148; GFX7-NEXT: v_mov_b32_e32 v2, s2 3149; GFX7-NEXT: v_mov_b32_e32 v1, s5 3150; GFX7-NEXT: v_mov_b32_e32 v3, s3 3151; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3152; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3153; GFX7-NEXT: buffer_wbinvl1_vol 3154; GFX7-NEXT: v_mov_b32_e32 v0, s0 3155; GFX7-NEXT: v_mov_b32_e32 v1, s1 3156; GFX7-NEXT: flat_store_dword v[0:1], v2 3157; GFX7-NEXT: s_endpgm 3158; 3159; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: 3160; GFX10-WGP: ; %bb.0: ; %entry 3161; GFX10-WGP-NEXT: s_clause 0x1 3162; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3163; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3164; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3165; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3166; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3167; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3168; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3169; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3170; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3171; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3172; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3173; GFX10-WGP-NEXT: buffer_gl0_inv 3174; GFX10-WGP-NEXT: buffer_gl1_inv 3175; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3176; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3177; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3178; GFX10-WGP-NEXT: s_endpgm 3179; 3180; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: 3181; GFX10-CU: ; %bb.0: ; %entry 3182; GFX10-CU-NEXT: s_clause 0x1 3183; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3184; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3185; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3186; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3187; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3188; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3189; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3190; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3191; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3192; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc 3193; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3194; GFX10-CU-NEXT: buffer_gl0_inv 3195; GFX10-CU-NEXT: buffer_gl1_inv 3196; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3197; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3198; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3199; GFX10-CU-NEXT: s_endpgm 3200; 3201; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: 3202; SKIP-CACHE-INV: ; %bb.0: ; %entry 3203; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3204; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3205; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3206; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3207; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3208; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3209; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3210; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3211; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3212; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3213; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3214; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3215; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3216; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3217; SKIP-CACHE-INV-NEXT: s_endpgm 3218; 3219; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: 3220; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3221; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3222; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3223; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3224; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3225; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3226; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3227; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3228; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 3229; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 3230; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3231; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3232; 3233; 
GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: 3234; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3235; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3236; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3237; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3238; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3239; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3240; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3241; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3242; GFX90A-TGSPLIT-NEXT: buffer_invl2 3243; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3244; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3245; GFX90A-TGSPLIT-NEXT: s_endpgm 3246 i32* %out, i32 %in, i32 %old) { 3247entry: 3248 %gep = getelementptr i32, i32* %out, i32 4 3249 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic 3250 %val0 = extractvalue { i32, i1 } %val, 0 3251 store i32 %val0, i32* %out, align 4 3252 ret void 3253} 3254 3255define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( 3256; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg: 3257; GFX7: ; %bb.0: ; %entry 3258; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3259; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3260; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3261; GFX7-NEXT: s_add_u32 s4, s0, 16 3262; GFX7-NEXT: s_addc_u32 s5, s1, 0 3263; GFX7-NEXT: v_mov_b32_e32 v0, s4 3264; GFX7-NEXT: v_mov_b32_e32 v2, s2 3265; GFX7-NEXT: v_mov_b32_e32 v1, s5 3266; GFX7-NEXT: v_mov_b32_e32 v3, s3 3267; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3268; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3269; GFX7-NEXT: v_mov_b32_e32 v0, s0 3270; GFX7-NEXT: v_mov_b32_e32 v1, s1 3271; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3272; GFX7-NEXT: flat_store_dword v[0:1], v2 3273; GFX7-NEXT: s_endpgm 3274; 3275; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: 3276; GFX10-WGP: ; %bb.0: ; %entry 3277; GFX10-WGP-NEXT: s_clause 0x1 3278; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3279; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3280; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3281; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3282; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3283; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3284; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3285; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3286; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3287; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3288; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3289; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3290; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3291; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3292; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3293; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3294; GFX10-WGP-NEXT: s_endpgm 3295; 3296; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: 3297; GFX10-CU: ; %bb.0: ; %entry 3298; GFX10-CU-NEXT: s_clause 0x1 3299; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3300; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3301; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3302; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3303; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3304; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3305; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3306; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3307; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3308; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3309; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3310; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3311; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3312; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3313; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3314; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3315; GFX10-CU-NEXT: s_endpgm 3316; 3317; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_ret_cmpxchg: 3318; SKIP-CACHE-INV: ; %bb.0: ; %entry 3319; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3320; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3321; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) 3322; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3323; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3324; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3325; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3326; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3327; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3328; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3329; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3330; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3331; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3332; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3333; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3334; SKIP-CACHE-INV-NEXT: s_endpgm 3335; 3336; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: 3337; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3338; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3339; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3340; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3341; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3342; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3343; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 3344; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3345; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3346; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3347; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3348; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3349; 3350; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: 3351; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3352; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3353; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3354; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3355; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3356; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3357; GFX90A-TGSPLIT-NEXT: buffer_wbl2 3358; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 3359; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3360; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3361; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3362; GFX90A-TGSPLIT-NEXT: s_endpgm 3363 i32* %out, i32 %in, i32 %old) { 3364entry: 3365 %gep = getelementptr i32, i32* %out, i32 4 3366 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic 3367 %val0 = extractvalue { i32, i1 } %val, 0 3368 store i32 %val0, i32* %out, align 4 3369 ret void 3370} 3371 3372define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( 3373; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: 3374; GFX7: ; %bb.0: ; %entry 3375; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3376; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3377; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3378; GFX7-NEXT: s_add_u32 s4, s0, 16 3379; GFX7-NEXT: s_addc_u32 s5, s1, 0 3380; GFX7-NEXT: v_mov_b32_e32 v0, s4 3381; GFX7-NEXT: v_mov_b32_e32 v2, s2 3382; GFX7-NEXT: v_mov_b32_e32 v1, s5 3383; GFX7-NEXT: v_mov_b32_e32 v3, s3 3384; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3385; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3386; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3387; GFX7-NEXT: buffer_wbinvl1_vol 3388; GFX7-NEXT: v_mov_b32_e32 v0, s0 3389; GFX7-NEXT: v_mov_b32_e32 v1, s1 3390; GFX7-NEXT: flat_store_dword v[0:1], v2 3391; GFX7-NEXT: s_endpgm 3392; 3393; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: 3394; GFX10-WGP: ; %bb.0: ; %entry 3395; GFX10-WGP-NEXT: s_clause 0x1 3396; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3397; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3398; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3399; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3400; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3401; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3402; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3403; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3404; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3405; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3406; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3407; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3408; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3409; GFX10-WGP-NEXT: buffer_gl0_inv 3410; GFX10-WGP-NEXT: buffer_gl1_inv 3411; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3412; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3413; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3414; GFX10-WGP-NEXT: s_endpgm 3415; 3416; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: 3417; GFX10-CU: ; %bb.0: ; %entry 3418; GFX10-CU-NEXT: s_clause 0x1 3419; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3420; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3421; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3422; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3423; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3424; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3425; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3426; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3427; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3428; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3429; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3430; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3431; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3432; GFX10-CU-NEXT: buffer_gl0_inv 3433; GFX10-CU-NEXT: buffer_gl1_inv 3434; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3435; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3436; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3437; GFX10-CU-NEXT: s_endpgm 3438; 3439; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: 3440; SKIP-CACHE-INV: ; %bb.0: ; %entry 3441; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3442; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3443; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3444; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3445; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3446; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3447; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3448; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3449; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3450; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 3451; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3452; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3453; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3454; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3455; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3456; SKIP-CACHE-INV-NEXT: s_endpgm 3457; 3458; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: 3459; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3460; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3461; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3462; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3463; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3464; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3465; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 3466; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3467; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3468; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3469; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 3470; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 3471; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3472; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3473; 3474; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: 3475; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3476; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3477; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3478; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3479; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3480; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3481; GFX90A-TGSPLIT-NEXT: buffer_wbl2 3482; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3483; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3484; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3485; GFX90A-TGSPLIT-NEXT: buffer_invl2 3486; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3487; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3488; GFX90A-TGSPLIT-NEXT: s_endpgm 3489 i32* %out, i32 %in, i32 %old) { 3490entry: 3491 %gep = getelementptr i32, i32* %out, i32 4 3492 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic 3493 %val0 = extractvalue { i32, i1 } %val, 0 3494 store i32 %val0, i32* %out, align 4 3495 ret void 3496} 3497 3498define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( 3499; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: 3500; GFX7: ; %bb.0: ; %entry 3501; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3502; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3503; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3504; GFX7-NEXT: s_add_u32 s4, s0, 16 3505; GFX7-NEXT: s_addc_u32 s5, s1, 0 3506; GFX7-NEXT: v_mov_b32_e32 v0, s4 3507; GFX7-NEXT: v_mov_b32_e32 v2, s2 3508; GFX7-NEXT: v_mov_b32_e32 v1, s5 3509; GFX7-NEXT: v_mov_b32_e32 v3, s3 3510; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3511; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3512; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3513; GFX7-NEXT: buffer_wbinvl1_vol 3514; GFX7-NEXT: v_mov_b32_e32 v0, s0 3515; GFX7-NEXT: v_mov_b32_e32 v1, s1 3516; GFX7-NEXT: flat_store_dword v[0:1], v2 3517; GFX7-NEXT: s_endpgm 3518; 3519; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: 3520; GFX10-WGP: ; %bb.0: ; %entry 3521; GFX10-WGP-NEXT: s_clause 0x1 3522; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3523; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3524; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3525; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3526; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3527; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3528; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3529; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3530; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3531; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3532; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3533; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3534; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 3535; GFX10-WGP-NEXT: buffer_gl0_inv 3536; GFX10-WGP-NEXT: buffer_gl1_inv 3537; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3538; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3539; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3540; GFX10-WGP-NEXT: s_endpgm 3541; 3542; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: 3543; GFX10-CU: ; %bb.0: ; %entry 3544; GFX10-CU-NEXT: s_clause 0x1 3545; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3546; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3547; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3548; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3549; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3550; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3551; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3552; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3553; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3554; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3555; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3556; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3557; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3558; GFX10-CU-NEXT: buffer_gl0_inv 3559; GFX10-CU-NEXT: buffer_gl1_inv 3560; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3561; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3562; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3563; GFX10-CU-NEXT: s_endpgm 3564; 3565; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: 3566; SKIP-CACHE-INV: ; %bb.0: ; %entry 3567; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3568; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3569; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3570; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3571; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3572; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3573; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3574; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3575; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3576; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3577; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3578; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 3579; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3580; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3581; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3582; SKIP-CACHE-INV-NEXT: s_endpgm 3583; 3584; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: 3585; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3586; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3587; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3588; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3589; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3590; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3591; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 3592; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3593; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3594; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3595; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 3596; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 3597; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3598; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3599; 3600; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: 3601; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3602; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3603; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3604; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3605; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3606; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3607; GFX90A-TGSPLIT-NEXT: buffer_wbl2 3608; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3609; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3610; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3611; GFX90A-TGSPLIT-NEXT: buffer_invl2 3612; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3613; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3614; GFX90A-TGSPLIT-NEXT: s_endpgm 3615 i32* %out, i32 %in, i32 %old) { 3616entry: 3617 
%gep = getelementptr i32, i32* %out, i32 4 3618 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic 3619 %val0 = extractvalue { i32, i1 } %val, 0 3620 store i32 %val0, i32* %out, align 4 3621 ret void 3622} 3623 3624define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( 3625; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: 3626; GFX7: ; %bb.0: ; %entry 3627; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3628; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3629; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3630; GFX7-NEXT: s_add_u32 s4, s0, 16 3631; GFX7-NEXT: s_addc_u32 s5, s1, 0 3632; GFX7-NEXT: v_mov_b32_e32 v0, s4 3633; GFX7-NEXT: v_mov_b32_e32 v2, s2 3634; GFX7-NEXT: v_mov_b32_e32 v1, s5 3635; GFX7-NEXT: v_mov_b32_e32 v3, s3 3636; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3637; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3638; GFX7-NEXT: buffer_wbinvl1_vol 3639; GFX7-NEXT: v_mov_b32_e32 v0, s0 3640; GFX7-NEXT: v_mov_b32_e32 v1, s1 3641; GFX7-NEXT: flat_store_dword v[0:1], v2 3642; GFX7-NEXT: s_endpgm 3643; 3644; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: 3645; GFX10-WGP: ; %bb.0: ; %entry 3646; GFX10-WGP-NEXT: s_clause 0x1 3647; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3648; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3649; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3650; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3651; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3652; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3653; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3654; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3655; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3656; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3657; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3658; GFX10-WGP-NEXT: buffer_gl0_inv 3659; GFX10-WGP-NEXT: buffer_gl1_inv 3660; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3661; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3662; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3663; GFX10-WGP-NEXT: s_endpgm 3664; 3665; GFX10-CU-LABEL: 
flat_system_monotonic_acquire_ret_cmpxchg: 3666; GFX10-CU: ; %bb.0: ; %entry 3667; GFX10-CU-NEXT: s_clause 0x1 3668; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3669; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3670; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3671; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3672; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3673; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3674; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3675; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3676; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3677; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3678; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3679; GFX10-CU-NEXT: buffer_gl0_inv 3680; GFX10-CU-NEXT: buffer_gl1_inv 3681; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3682; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3683; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3684; GFX10-CU-NEXT: s_endpgm 3685; 3686; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: 3687; SKIP-CACHE-INV: ; %bb.0: ; %entry 3688; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3689; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3690; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3691; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3692; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3693; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3694; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3695; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3696; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3697; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3698; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3699; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3700; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3701; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3702; SKIP-CACHE-INV-NEXT: s_endpgm 3703; 3704; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: 3705; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3706; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3707; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 3708; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3709; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3710; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3711; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3712; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3713; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 3714; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 3715; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3716; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3717; 3718; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: 3719; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3720; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3721; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3722; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3723; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3724; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3725; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3726; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3727; GFX90A-TGSPLIT-NEXT: buffer_invl2 3728; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3729; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3730; GFX90A-TGSPLIT-NEXT: s_endpgm 3731 i32* %out, i32 %in, i32 %old) { 3732entry: 3733 %gep = getelementptr i32, i32* %out, i32 4 3734 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic acquire 3735 %val0 = extractvalue { i32, i1 } %val, 0 3736 store i32 %val0, i32* %out, align 4 3737 ret void 3738} 3739 3740define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( 3741; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: 3742; GFX7: ; %bb.0: ; %entry 3743; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3744; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3745; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3746; GFX7-NEXT: s_add_u32 s4, s0, 16 3747; GFX7-NEXT: s_addc_u32 s5, s1, 0 3748; GFX7-NEXT: v_mov_b32_e32 v0, s4 
3749; GFX7-NEXT: v_mov_b32_e32 v2, s2 3750; GFX7-NEXT: v_mov_b32_e32 v1, s5 3751; GFX7-NEXT: v_mov_b32_e32 v3, s3 3752; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3753; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3754; GFX7-NEXT: buffer_wbinvl1_vol 3755; GFX7-NEXT: v_mov_b32_e32 v0, s0 3756; GFX7-NEXT: v_mov_b32_e32 v1, s1 3757; GFX7-NEXT: flat_store_dword v[0:1], v2 3758; GFX7-NEXT: s_endpgm 3759; 3760; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: 3761; GFX10-WGP: ; %bb.0: ; %entry 3762; GFX10-WGP-NEXT: s_clause 0x1 3763; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3764; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3765; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3766; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3767; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3768; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3769; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3770; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3771; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3772; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3773; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3774; GFX10-WGP-NEXT: buffer_gl0_inv 3775; GFX10-WGP-NEXT: buffer_gl1_inv 3776; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3777; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3778; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3779; GFX10-WGP-NEXT: s_endpgm 3780; 3781; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: 3782; GFX10-CU: ; %bb.0: ; %entry 3783; GFX10-CU-NEXT: s_clause 0x1 3784; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3785; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3786; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3787; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3788; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3789; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3790; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3791; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3792; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3793; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3794; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3795; GFX10-CU-NEXT: 
buffer_gl0_inv 3796; GFX10-CU-NEXT: buffer_gl1_inv 3797; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3798; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3799; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3800; GFX10-CU-NEXT: s_endpgm 3801; 3802; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_ret_cmpxchg: 3803; SKIP-CACHE-INV: ; %bb.0: ; %entry 3804; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3805; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3806; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3807; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3808; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3809; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3810; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3811; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3812; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3813; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3814; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3815; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3816; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3817; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3818; SKIP-CACHE-INV-NEXT: s_endpgm 3819; 3820; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: 3821; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3822; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3823; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3824; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3825; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3826; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3827; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3828; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3829; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 3830; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 3831; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3832; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3833; 3834; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: 3835; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry 3836; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3837; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3838; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3839; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3840; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3841; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3842; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3843; GFX90A-TGSPLIT-NEXT: buffer_invl2 3844; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3845; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3846; GFX90A-TGSPLIT-NEXT: s_endpgm 3847 i32* %out, i32 %in, i32 %old) { 3848entry: 3849 %gep = getelementptr i32, i32* %out, i32 4 3850 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire 3851 %val0 = extractvalue { i32, i1 } %val, 0 3852 store i32 %val0, i32* %out, align 4 3853 ret void 3854} 3855; Returning flat cmpxchg with no syncscope (the default system scope) on %gep (four i32 elements past %out), success ordering release and failure ordering acquire; the loaded value is stored to %out. The expected code waits before the atomic, then waits and invalidates caches after it (no invalidates in the skip-cache-invalidations run). 3856define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( 3857; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: 3858; GFX7: ; %bb.0: ; %entry 3859; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3860; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3861; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3862; GFX7-NEXT: s_add_u32 s4, s0, 16 3863; GFX7-NEXT: s_addc_u32 s5, s1, 0 3864; GFX7-NEXT: v_mov_b32_e32 v0, s4 3865; GFX7-NEXT: v_mov_b32_e32 v2, s2 3866; GFX7-NEXT: v_mov_b32_e32 v1, s5 3867; GFX7-NEXT: v_mov_b32_e32 v3, s3 3868; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3869; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3870; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3871; GFX7-NEXT: buffer_wbinvl1_vol 3872; GFX7-NEXT: v_mov_b32_e32 v0, s0 3873; GFX7-NEXT: v_mov_b32_e32 v1, s1 3874; GFX7-NEXT: flat_store_dword v[0:1], v2 3875; GFX7-NEXT: s_endpgm 3876; 3877; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: 3878; GFX10-WGP: ; %bb.0: ; %entry 3879; GFX10-WGP-NEXT: s_clause 0x1 3880; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3881; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3882; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3883; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3884; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3885; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3886; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3887; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3888; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3889; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3890; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3891; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3892; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3893; GFX10-WGP-NEXT: buffer_gl0_inv 3894; GFX10-WGP-NEXT: buffer_gl1_inv 3895; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3896; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3897; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3898; GFX10-WGP-NEXT: s_endpgm 3899; 3900; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: 3901; GFX10-CU: ; %bb.0: ; %entry 3902; GFX10-CU-NEXT: s_clause 0x1 3903; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3904; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3905; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3906; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3907; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3908; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3909; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3910; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3911; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3912; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3913; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3914; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3915; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3916; GFX10-CU-NEXT: buffer_gl0_inv 3917; GFX10-CU-NEXT: buffer_gl1_inv 3918; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3919; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3920; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3921; GFX10-CU-NEXT: s_endpgm 3922; 3923; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_ret_cmpxchg: 3924; SKIP-CACHE-INV: ; %bb.0: ; %entry 3925; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3926; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3927; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3928; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3929; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3930; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3931; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3932; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3933; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3934; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3935; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3936; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3937; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3938; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3939; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3940; SKIP-CACHE-INV-NEXT: s_endpgm 3941; 3942; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: 3943; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3944; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3945; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3946; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3947; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3948; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3949; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 3950; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3951; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3952; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3953; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 3954; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 3955; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3956; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3957; 3958; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: 3959; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3960; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3961; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3962; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3963; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], 
s[0:1] op_sel:[0,1] 3964; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3965; GFX90A-TGSPLIT-NEXT: buffer_wbl2 3966; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3967; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3968; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3969; GFX90A-TGSPLIT-NEXT: buffer_invl2 3970; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3971; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3972; GFX90A-TGSPLIT-NEXT: s_endpgm 3973 i32* %out, i32 %in, i32 %old) { 3974entry: 3975 %gep = getelementptr i32, i32* %out, i32 4 3976 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire 3977 %val0 = extractvalue { i32, i1 } %val, 0 3978 store i32 %val0, i32* %out, align 4 3979 ret void 3980} 3981; Returning flat cmpxchg with no syncscope (the default system scope) on %gep (four i32 elements past %out), success ordering acq_rel and failure ordering acquire; the loaded value is stored to %out. The expected code waits before the atomic, then waits and invalidates caches after it (no invalidates in the skip-cache-invalidations run). 3982define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( 3983; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: 3984; GFX7: ; %bb.0: ; %entry 3985; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3986; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3987; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3988; GFX7-NEXT: s_add_u32 s4, s0, 16 3989; GFX7-NEXT: s_addc_u32 s5, s1, 0 3990; GFX7-NEXT: v_mov_b32_e32 v0, s4 3991; GFX7-NEXT: v_mov_b32_e32 v2, s2 3992; GFX7-NEXT: v_mov_b32_e32 v1, s5 3993; GFX7-NEXT: v_mov_b32_e32 v3, s3 3994; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3995; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3996; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3997; GFX7-NEXT: buffer_wbinvl1_vol 3998; GFX7-NEXT: v_mov_b32_e32 v0, s0 3999; GFX7-NEXT: v_mov_b32_e32 v1, s1 4000; GFX7-NEXT: flat_store_dword v[0:1], v2 4001; GFX7-NEXT: s_endpgm 4002; 4003; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: 4004; GFX10-WGP: ; %bb.0: ; %entry 4005; GFX10-WGP-NEXT: s_clause 0x1 4006; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4007; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4008; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4009; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4010; 
GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4011; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4012; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4013; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4014; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4015; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4016; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4017; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4018; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4019; GFX10-WGP-NEXT: buffer_gl0_inv 4020; GFX10-WGP-NEXT: buffer_gl1_inv 4021; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4022; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4023; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4024; GFX10-WGP-NEXT: s_endpgm 4025; 4026; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: 4027; GFX10-CU: ; %bb.0: ; %entry 4028; GFX10-CU-NEXT: s_clause 0x1 4029; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4030; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4031; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4032; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4033; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4034; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4035; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4036; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4037; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4038; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4039; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4040; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4041; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4042; GFX10-CU-NEXT: buffer_gl0_inv 4043; GFX10-CU-NEXT: buffer_gl1_inv 4044; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4045; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4046; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4047; GFX10-CU-NEXT: s_endpgm 4048; 4049; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: 4050; SKIP-CACHE-INV: ; %bb.0: ; %entry 4051; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4052; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4053; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4054; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4055; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4056; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4057; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4058; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4059; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4060; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4061; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4062; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4063; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4064; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4065; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4066; SKIP-CACHE-INV-NEXT: s_endpgm 4067; 4068; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: 4069; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4070; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4071; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4072; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4073; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4074; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4075; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 4076; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4077; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4078; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4079; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 4080; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 4081; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4082; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4083; 4084; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: 4085; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4086; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4087; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4088; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4089; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4090; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4091; GFX90A-TGSPLIT-NEXT: buffer_wbl2 4092; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4093; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4094; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4095; GFX90A-TGSPLIT-NEXT: buffer_invl2 4096; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4097; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4098; GFX90A-TGSPLIT-NEXT: s_endpgm 4099 i32* %out, i32 %in, i32 %old) { 4100entry: 4101 %gep = getelementptr i32, i32* %out, i32 4 4102 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire 4103 %val0 = extractvalue { i32, i1 } %val, 0 4104 store i32 %val0, i32* %out, align 4 4105 ret void 4106} 4107; Returning flat cmpxchg with no syncscope (the default system scope) on %gep (four i32 elements past %out), success ordering seq_cst and failure ordering acquire; the loaded value is stored to %out. The expected code waits before the atomic, then waits and invalidates caches after it (no invalidates in the skip-cache-invalidations run). 4108define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( 4109; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: 4110; GFX7: ; %bb.0: ; %entry 4111; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4112; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4113; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4114; GFX7-NEXT: s_add_u32 s4, s0, 16 4115; GFX7-NEXT: s_addc_u32 s5, s1, 0 4116; GFX7-NEXT: v_mov_b32_e32 v0, s4 4117; GFX7-NEXT: v_mov_b32_e32 v2, s2 4118; GFX7-NEXT: v_mov_b32_e32 v1, s5 4119; GFX7-NEXT: v_mov_b32_e32 v3, s3 4120; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4121; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4122; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4123; GFX7-NEXT: buffer_wbinvl1_vol 4124; GFX7-NEXT: v_mov_b32_e32 v0, s0 4125; GFX7-NEXT: v_mov_b32_e32 v1, s1 4126; GFX7-NEXT: flat_store_dword v[0:1], v2 4127; GFX7-NEXT: s_endpgm 4128; 4129; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: 4130; GFX10-WGP: ; %bb.0: ; %entry 4131; GFX10-WGP-NEXT: s_clause 0x1 4132; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4133; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4134; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4135; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4136; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4137; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4138; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4139; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s5 4140; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4141; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4142; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4143; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4144; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4145; GFX10-WGP-NEXT: buffer_gl0_inv 4146; GFX10-WGP-NEXT: buffer_gl1_inv 4147; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4148; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4149; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4150; GFX10-WGP-NEXT: s_endpgm 4151; 4152; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: 4153; GFX10-CU: ; %bb.0: ; %entry 4154; GFX10-CU-NEXT: s_clause 0x1 4155; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4156; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4157; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4158; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4159; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4160; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4161; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4162; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4163; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4164; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4165; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4166; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4167; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4168; GFX10-CU-NEXT: buffer_gl0_inv 4169; GFX10-CU-NEXT: buffer_gl1_inv 4170; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4171; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4172; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4173; GFX10-CU-NEXT: s_endpgm 4174; 4175; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: 4176; SKIP-CACHE-INV: ; %bb.0: ; %entry 4177; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4178; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4179; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4180; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4181; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4182; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4183; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4184; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4185; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4186; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4187; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4188; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4189; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4190; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4191; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4192; SKIP-CACHE-INV-NEXT: s_endpgm 4193; 4194; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: 4195; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4196; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4197; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4198; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4199; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4200; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4201; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 4202; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4203; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4204; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4205; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 4206; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 4207; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4208; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4209; 4210; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: 4211; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4212; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4213; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4214; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4215; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4216; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4217; GFX90A-TGSPLIT-NEXT: buffer_wbl2 4218; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4219; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4220; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) 4221; GFX90A-TGSPLIT-NEXT: buffer_invl2 4222; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4223; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4224; GFX90A-TGSPLIT-NEXT: s_endpgm 4225 i32* %out, i32 %in, i32 %old) { 4226entry: 4227 %gep = getelementptr i32, i32* %out, i32 4 4228 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire 4229 %val0 = extractvalue { i32, i1 } %val, 0 4230 store i32 %val0, i32* %out, align 4 4231 ret void 4232} 4233; Returning flat cmpxchg with no syncscope (the default system scope) on %gep (four i32 elements past %out), success ordering monotonic and failure ordering seq_cst (failure is the stronger of the two here); the loaded value is stored to %out. The expected code still waits before the atomic, then waits and invalidates caches after it (no invalidates in the skip-cache-invalidations run). 4234define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( 4235; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: 4236; GFX7: ; %bb.0: ; %entry 4237; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4238; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4239; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4240; GFX7-NEXT: s_add_u32 s4, s0, 16 4241; GFX7-NEXT: s_addc_u32 s5, s1, 0 4242; GFX7-NEXT: v_mov_b32_e32 v0, s4 4243; GFX7-NEXT: v_mov_b32_e32 v2, s2 4244; GFX7-NEXT: v_mov_b32_e32 v1, s5 4245; GFX7-NEXT: v_mov_b32_e32 v3, s3 4246; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4247; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4248; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4249; GFX7-NEXT: buffer_wbinvl1_vol 4250; GFX7-NEXT: v_mov_b32_e32 v0, s0 4251; GFX7-NEXT: v_mov_b32_e32 v1, s1 4252; GFX7-NEXT: flat_store_dword v[0:1], v2 4253; GFX7-NEXT: s_endpgm 4254; 4255; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: 4256; GFX10-WGP: ; %bb.0: ; %entry 4257; GFX10-WGP-NEXT: s_clause 0x1 4258; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4259; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4260; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4261; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4262; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4263; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4264; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4265; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4266; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4267; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4268; GFX10-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 4269; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4270; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4271; GFX10-WGP-NEXT: buffer_gl0_inv 4272; GFX10-WGP-NEXT: buffer_gl1_inv 4273; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4274; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4275; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4276; GFX10-WGP-NEXT: s_endpgm 4277; 4278; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: 4279; GFX10-CU: ; %bb.0: ; %entry 4280; GFX10-CU-NEXT: s_clause 0x1 4281; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4282; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4283; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4284; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4285; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4286; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4287; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4288; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4289; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4290; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4291; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4292; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4293; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4294; GFX10-CU-NEXT: buffer_gl0_inv 4295; GFX10-CU-NEXT: buffer_gl1_inv 4296; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4297; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4298; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4299; GFX10-CU-NEXT: s_endpgm 4300; 4301; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: 4302; SKIP-CACHE-INV: ; %bb.0: ; %entry 4303; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4304; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4305; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4306; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4307; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4308; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4309; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4310; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4311; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4312; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
4313; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4314; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4315; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4316; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4317; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4318; SKIP-CACHE-INV-NEXT: s_endpgm 4319; 4320; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: 4321; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4322; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4323; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4324; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4325; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4326; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4327; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 4328; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4329; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4330; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4331; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 4332; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 4333; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4334; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4335; 4336; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: 4337; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4338; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4339; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4340; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4341; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4342; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4343; GFX90A-TGSPLIT-NEXT: buffer_wbl2 4344; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4345; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4346; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4347; GFX90A-TGSPLIT-NEXT: buffer_invl2 4348; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4349; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 4350; GFX90A-TGSPLIT-NEXT: s_endpgm 4351 i32* %out, i32 %in, i32 %old) { 4352entry: 4353 %gep = getelementptr i32, i32* %out, i32 4 4354 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic seq_cst 4355 %val0 = extractvalue { i32, i1 } %val, 0 4356 store i32 %val0, i32* %out, align 4 4357 ret void 4358} 4359; Returning flat cmpxchg with no syncscope (the default system scope) on %gep (four i32 elements past %out), success ordering acquire and failure ordering seq_cst; the loaded value is stored to %out. The expected code waits before the atomic, then waits and invalidates caches after it (no invalidates in the skip-cache-invalidations run). 4360define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( 4361; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: 4362; GFX7: ; %bb.0: ; %entry 4363; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4364; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4365; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4366; GFX7-NEXT: s_add_u32 s4, s0, 16 4367; GFX7-NEXT: s_addc_u32 s5, s1, 0 4368; GFX7-NEXT: v_mov_b32_e32 v0, s4 4369; GFX7-NEXT: v_mov_b32_e32 v2, s2 4370; GFX7-NEXT: v_mov_b32_e32 v1, s5 4371; GFX7-NEXT: v_mov_b32_e32 v3, s3 4372; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4373; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4374; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4375; GFX7-NEXT: buffer_wbinvl1_vol 4376; GFX7-NEXT: v_mov_b32_e32 v0, s0 4377; GFX7-NEXT: v_mov_b32_e32 v1, s1 4378; GFX7-NEXT: flat_store_dword v[0:1], v2 4379; GFX7-NEXT: s_endpgm 4380; 4381; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: 4382; GFX10-WGP: ; %bb.0: ; %entry 4383; GFX10-WGP-NEXT: s_clause 0x1 4384; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4385; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4386; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4387; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4388; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4389; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4390; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4391; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4392; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4393; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4394; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4395; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4396; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4397; 
GFX10-WGP-NEXT: buffer_gl0_inv 4398; GFX10-WGP-NEXT: buffer_gl1_inv 4399; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4400; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4401; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4402; GFX10-WGP-NEXT: s_endpgm 4403; 4404; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: 4405; GFX10-CU: ; %bb.0: ; %entry 4406; GFX10-CU-NEXT: s_clause 0x1 4407; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4408; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4409; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4410; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4411; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4412; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4413; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4414; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4415; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4416; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4417; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4418; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4419; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4420; GFX10-CU-NEXT: buffer_gl0_inv 4421; GFX10-CU-NEXT: buffer_gl1_inv 4422; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4423; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4424; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4425; GFX10-CU-NEXT: s_endpgm 4426; 4427; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: 4428; SKIP-CACHE-INV: ; %bb.0: ; %entry 4429; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4430; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4431; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4432; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4433; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4434; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4435; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4436; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4437; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4438; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4439; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4440; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4441; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4442; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4443; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4444; SKIP-CACHE-INV-NEXT: s_endpgm 4445; 4446; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: 4447; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4448; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4449; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4450; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4451; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4452; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4453; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 4454; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4455; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4456; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4457; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 4458; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 4459; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4460; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4461; 4462; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: 4463; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4464; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4465; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4466; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4467; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4468; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4469; GFX90A-TGSPLIT-NEXT: buffer_wbl2 4470; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4471; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4472; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4473; GFX90A-TGSPLIT-NEXT: buffer_invl2 4474; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4475; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4476; GFX90A-TGSPLIT-NEXT: s_endpgm 4477 i32* %out, i32 %in, i32 %old) { 4478entry: 4479 %gep = getelementptr i32, i32* 
%out, i32 4 4480 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire seq_cst 4481 %val0 = extractvalue { i32, i1 } %val, 0 4482 store i32 %val0, i32* %out, align 4 4483 ret void 4484} 4485 4486define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( 4487; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg: 4488; GFX7: ; %bb.0: ; %entry 4489; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4490; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4491; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4492; GFX7-NEXT: s_add_u32 s4, s0, 16 4493; GFX7-NEXT: s_addc_u32 s5, s1, 0 4494; GFX7-NEXT: v_mov_b32_e32 v0, s4 4495; GFX7-NEXT: v_mov_b32_e32 v2, s2 4496; GFX7-NEXT: v_mov_b32_e32 v1, s5 4497; GFX7-NEXT: v_mov_b32_e32 v3, s3 4498; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4499; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4500; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4501; GFX7-NEXT: buffer_wbinvl1_vol 4502; GFX7-NEXT: v_mov_b32_e32 v0, s0 4503; GFX7-NEXT: v_mov_b32_e32 v1, s1 4504; GFX7-NEXT: flat_store_dword v[0:1], v2 4505; GFX7-NEXT: s_endpgm 4506; 4507; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: 4508; GFX10-WGP: ; %bb.0: ; %entry 4509; GFX10-WGP-NEXT: s_clause 0x1 4510; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4511; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4512; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4513; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4514; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4515; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4516; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4517; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4518; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4519; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4520; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4521; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4522; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4523; GFX10-WGP-NEXT: buffer_gl0_inv 4524; GFX10-WGP-NEXT: buffer_gl1_inv 4525; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4526; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 
4527; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4528; GFX10-WGP-NEXT: s_endpgm 4529; 4530; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: 4531; GFX10-CU: ; %bb.0: ; %entry 4532; GFX10-CU-NEXT: s_clause 0x1 4533; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4534; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4535; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4536; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4537; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4538; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4539; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4540; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4541; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4542; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4543; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4544; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4545; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4546; GFX10-CU-NEXT: buffer_gl0_inv 4547; GFX10-CU-NEXT: buffer_gl1_inv 4548; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4549; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4550; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4551; GFX10-CU-NEXT: s_endpgm 4552; 4553; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_ret_cmpxchg: 4554; SKIP-CACHE-INV: ; %bb.0: ; %entry 4555; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4556; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4557; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4558; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4559; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4560; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4561; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4562; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4563; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4564; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4565; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4566; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4567; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4568; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4569; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4570; 
SKIP-CACHE-INV-NEXT: s_endpgm 4571; 4572; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: 4573; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4574; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4575; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4576; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4577; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4578; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4579; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 4580; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4581; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4582; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4583; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 4584; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 4585; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4586; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4587; 4588; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: 4589; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4590; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4591; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4592; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4593; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4594; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4595; GFX90A-TGSPLIT-NEXT: buffer_wbl2 4596; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4597; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4598; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4599; GFX90A-TGSPLIT-NEXT: buffer_invl2 4600; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4601; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4602; GFX90A-TGSPLIT-NEXT: s_endpgm 4603 i32* %out, i32 %in, i32 %old) { 4604entry: 4605 %gep = getelementptr i32, i32* %out, i32 4 4606 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release seq_cst 4607 %val0 = extractvalue { i32, i1 } %val, 0 4608 store i32 %val0, 
i32* %out, align 4 4609 ret void 4610} 4611 4612define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( 4613; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: 4614; GFX7: ; %bb.0: ; %entry 4615; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4616; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4617; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4618; GFX7-NEXT: s_add_u32 s4, s0, 16 4619; GFX7-NEXT: s_addc_u32 s5, s1, 0 4620; GFX7-NEXT: v_mov_b32_e32 v0, s4 4621; GFX7-NEXT: v_mov_b32_e32 v2, s2 4622; GFX7-NEXT: v_mov_b32_e32 v1, s5 4623; GFX7-NEXT: v_mov_b32_e32 v3, s3 4624; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4625; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4626; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4627; GFX7-NEXT: buffer_wbinvl1_vol 4628; GFX7-NEXT: v_mov_b32_e32 v0, s0 4629; GFX7-NEXT: v_mov_b32_e32 v1, s1 4630; GFX7-NEXT: flat_store_dword v[0:1], v2 4631; GFX7-NEXT: s_endpgm 4632; 4633; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: 4634; GFX10-WGP: ; %bb.0: ; %entry 4635; GFX10-WGP-NEXT: s_clause 0x1 4636; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4637; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4638; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4639; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4640; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4641; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4642; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4643; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4644; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4645; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4646; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4647; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4648; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4649; GFX10-WGP-NEXT: buffer_gl0_inv 4650; GFX10-WGP-NEXT: buffer_gl1_inv 4651; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4652; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4653; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4654; GFX10-WGP-NEXT: s_endpgm 4655; 4656; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: 4657; 
GFX10-CU: ; %bb.0: ; %entry 4658; GFX10-CU-NEXT: s_clause 0x1 4659; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4660; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4661; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4662; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4663; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4664; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4665; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4666; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4667; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4668; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4669; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4670; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4671; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4672; GFX10-CU-NEXT: buffer_gl0_inv 4673; GFX10-CU-NEXT: buffer_gl1_inv 4674; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4675; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4676; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4677; GFX10-CU-NEXT: s_endpgm 4678; 4679; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: 4680; SKIP-CACHE-INV: ; %bb.0: ; %entry 4681; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4682; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4683; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4684; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4685; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4686; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4687; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4688; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4689; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4690; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4691; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4692; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4693; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4694; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4695; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4696; SKIP-CACHE-INV-NEXT: s_endpgm 4697; 4698; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: 4699; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4700; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4701; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4702; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4703; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4704; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4705; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 4706; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4707; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4708; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4709; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 4710; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 4711; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4712; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4713; 4714; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: 4715; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4716; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4717; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4718; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4719; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4720; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4721; GFX90A-TGSPLIT-NEXT: buffer_wbl2 4722; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4723; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4724; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4725; GFX90A-TGSPLIT-NEXT: buffer_invl2 4726; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4727; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4728; GFX90A-TGSPLIT-NEXT: s_endpgm 4729 i32* %out, i32 %in, i32 %old) { 4730entry: 4731 %gep = getelementptr i32, i32* %out, i32 4 4732 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel seq_cst 4733 %val0 = extractvalue { i32, i1 } %val, 0 4734 store i32 %val0, i32* %out, align 4 4735 ret void 4736} 4737 4738define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( 4739; GFX7-LABEL: 
flat_system_seq_cst_seq_cst_ret_cmpxchg: 4740; GFX7: ; %bb.0: ; %entry 4741; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4742; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4743; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4744; GFX7-NEXT: s_add_u32 s4, s0, 16 4745; GFX7-NEXT: s_addc_u32 s5, s1, 0 4746; GFX7-NEXT: v_mov_b32_e32 v0, s4 4747; GFX7-NEXT: v_mov_b32_e32 v2, s2 4748; GFX7-NEXT: v_mov_b32_e32 v1, s5 4749; GFX7-NEXT: v_mov_b32_e32 v3, s3 4750; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4751; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4752; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4753; GFX7-NEXT: buffer_wbinvl1_vol 4754; GFX7-NEXT: v_mov_b32_e32 v0, s0 4755; GFX7-NEXT: v_mov_b32_e32 v1, s1 4756; GFX7-NEXT: flat_store_dword v[0:1], v2 4757; GFX7-NEXT: s_endpgm 4758; 4759; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: 4760; GFX10-WGP: ; %bb.0: ; %entry 4761; GFX10-WGP-NEXT: s_clause 0x1 4762; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4763; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4764; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4765; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4766; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4767; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4768; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4769; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4770; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4771; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4772; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4773; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4774; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4775; GFX10-WGP-NEXT: buffer_gl0_inv 4776; GFX10-WGP-NEXT: buffer_gl1_inv 4777; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4778; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4779; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4780; GFX10-WGP-NEXT: s_endpgm 4781; 4782; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: 4783; GFX10-CU: ; %bb.0: ; %entry 4784; GFX10-CU-NEXT: s_clause 0x1 4785; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4786; 
GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4787; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4788; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4789; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4790; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4791; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4792; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4793; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4794; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4795; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4796; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4797; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4798; GFX10-CU-NEXT: buffer_gl0_inv 4799; GFX10-CU-NEXT: buffer_gl1_inv 4800; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4801; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4802; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4803; GFX10-CU-NEXT: s_endpgm 4804; 4805; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: 4806; SKIP-CACHE-INV: ; %bb.0: ; %entry 4807; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4808; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4809; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4810; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4811; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4812; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4813; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4814; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4815; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4816; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4817; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4818; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4819; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4820; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4821; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4822; SKIP-CACHE-INV-NEXT: s_endpgm 4823; 4824; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: 4825; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4826; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4827; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4828; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4829; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4830; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4831; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 4832; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4833; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4834; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4835; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 4836; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 4837; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4838; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4839; 4840; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: 4841; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4842; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4843; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4844; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4845; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4846; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4847; GFX90A-TGSPLIT-NEXT: buffer_wbl2 4848; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4849; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4850; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4851; GFX90A-TGSPLIT-NEXT: buffer_invl2 4852; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4853; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4854; GFX90A-TGSPLIT-NEXT: s_endpgm 4855 i32* %out, i32 %in, i32 %old) { 4856entry: 4857 %gep = getelementptr i32, i32* %out, i32 4 4858 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst 4859 %val0 = extractvalue { i32, i1 } %val, 0 4860 store i32 %val0, i32* %out, align 4 4861 ret void 4862} 4863 4864define amdgpu_kernel void @flat_system_one_as_unordered_load( 4865; GFX7-LABEL: flat_system_one_as_unordered_load: 4866; GFX7: ; %bb.0: ; %entry 4867; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4868; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
4869; GFX7-NEXT: v_mov_b32_e32 v0, s0 4870; GFX7-NEXT: v_mov_b32_e32 v1, s1 4871; GFX7-NEXT: flat_load_dword v0, v[0:1] 4872; GFX7-NEXT: v_mov_b32_e32 v2, s2 4873; GFX7-NEXT: v_mov_b32_e32 v3, s3 4874; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4875; GFX7-NEXT: flat_store_dword v[2:3], v0 4876; GFX7-NEXT: s_endpgm 4877; 4878; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: 4879; GFX10-WGP: ; %bb.0: ; %entry 4880; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4881; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4882; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4883; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4884; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 4885; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 4886; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 4887; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4888; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4889; GFX10-WGP-NEXT: s_endpgm 4890; 4891; GFX10-CU-LABEL: flat_system_one_as_unordered_load: 4892; GFX10-CU: ; %bb.0: ; %entry 4893; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4894; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4895; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4896; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4897; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 4898; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 4899; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 4900; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4901; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4902; GFX10-CU-NEXT: s_endpgm 4903; 4904; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_load: 4905; SKIP-CACHE-INV: ; %bb.0: ; %entry 4906; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4907; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4908; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4909; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4910; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 4911; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4912; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 4913; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4914; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 4915; 
SKIP-CACHE-INV-NEXT: s_endpgm 4916; 4917; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: 4918; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4919; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4920; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4921; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4922; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4923; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4924; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4925; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4926; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4927; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 4928; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4929; 4930; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: 4931; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4932; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4933; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4934; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4935; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4936; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4937; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4938; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4939; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4940; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 4941; GFX90A-TGSPLIT-NEXT: s_endpgm 4942 i32* %in, i32* %out) { 4943entry: 4944 %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 4945 store i32 %val, i32* %out 4946 ret void 4947} 4948 4949define amdgpu_kernel void @flat_system_one_as_monotonic_load( 4950; GFX7-LABEL: flat_system_one_as_monotonic_load: 4951; GFX7: ; %bb.0: ; %entry 4952; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4953; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4954; GFX7-NEXT: v_mov_b32_e32 v0, s0 4955; GFX7-NEXT: v_mov_b32_e32 v1, s1 4956; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 4957; GFX7-NEXT: v_mov_b32_e32 v2, s2 4958; GFX7-NEXT: v_mov_b32_e32 v3, s3 4959; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4960; GFX7-NEXT: flat_store_dword v[2:3], v0 4961; 
GFX7-NEXT: s_endpgm 4962; 4963; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: 4964; GFX10-WGP: ; %bb.0: ; %entry 4965; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4966; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4967; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4968; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4969; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 4970; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 4971; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 4972; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4973; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4974; GFX10-WGP-NEXT: s_endpgm 4975; 4976; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: 4977; GFX10-CU: ; %bb.0: ; %entry 4978; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4979; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4980; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4981; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4982; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 4983; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 4984; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 4985; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4986; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4987; GFX10-CU-NEXT: s_endpgm 4988; 4989; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_load: 4990; SKIP-CACHE-INV: ; %bb.0: ; %entry 4991; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4992; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4993; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4994; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4995; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 4996; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4997; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 4998; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4999; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 5000; SKIP-CACHE-INV-NEXT: s_endpgm 5001; 5002; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: 5003; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5004; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5005; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5006; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5007; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 5008; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 5009; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5010; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 5011; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5012; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 5013; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5014; 5015; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: 5016; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5017; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5018; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5019; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5020; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 5021; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 5022; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5023; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 5024; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5025; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 5026; GFX90A-TGSPLIT-NEXT: s_endpgm 5027 i32* %in, i32* %out) { 5028entry: 5029 %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 5030 store i32 %val, i32* %out 5031 ret void 5032} 5033 5034define amdgpu_kernel void @flat_system_one_as_acquire_load( 5035; GFX7-LABEL: flat_system_one_as_acquire_load: 5036; GFX7: ; %bb.0: ; %entry 5037; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5038; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5039; GFX7-NEXT: v_mov_b32_e32 v0, s0 5040; GFX7-NEXT: v_mov_b32_e32 v1, s1 5041; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 5042; GFX7-NEXT: s_waitcnt vmcnt(0) 5043; GFX7-NEXT: buffer_wbinvl1_vol 5044; GFX7-NEXT: v_mov_b32_e32 v2, s2 5045; GFX7-NEXT: v_mov_b32_e32 v3, s3 5046; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5047; GFX7-NEXT: flat_store_dword v[2:3], v0 5048; GFX7-NEXT: s_endpgm 5049; 5050; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: 5051; GFX10-WGP: ; %bb.0: ; %entry 5052; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5053; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) 5054; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5055; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5056; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 5057; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5058; GFX10-WGP-NEXT: buffer_gl0_inv 5059; GFX10-WGP-NEXT: buffer_gl1_inv 5060; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 5061; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 5062; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5063; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5064; GFX10-WGP-NEXT: s_endpgm 5065; 5066; GFX10-CU-LABEL: flat_system_one_as_acquire_load: 5067; GFX10-CU: ; %bb.0: ; %entry 5068; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5069; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5070; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5071; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5072; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 5073; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5074; GFX10-CU-NEXT: buffer_gl0_inv 5075; GFX10-CU-NEXT: buffer_gl1_inv 5076; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 5077; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 5078; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5079; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5080; GFX10-CU-NEXT: s_endpgm 5081; 5082; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_load: 5083; SKIP-CACHE-INV: ; %bb.0: ; %entry 5084; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 5085; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5086; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5087; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5088; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 5089; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5090; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 5091; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 5092; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5093; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 5094; SKIP-CACHE-INV-NEXT: s_endpgm 5095; 5096; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: 5097; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5098; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5099; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 5100; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5101; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 5102; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 5103; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5104; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 5105; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 5106; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5107; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 5108; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5109; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 5110; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5111; 5112; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: 5113; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5114; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5115; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5116; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5117; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 5118; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 5119; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5120; GFX90A-TGSPLIT-NEXT: buffer_invl2 5121; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 5122; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5123; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 5124; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 5125; GFX90A-TGSPLIT-NEXT: s_endpgm 5126 i32* %in, i32* %out) { 5127entry: 5128 %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4 5129 store i32 %val, i32* %out 5130 ret void 5131} 5132 5133define amdgpu_kernel void @flat_system_one_as_seq_cst_load( 5134; GFX7-LABEL: flat_system_one_as_seq_cst_load: 5135; GFX7: ; %bb.0: ; %entry 5136; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5137; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5138; GFX7-NEXT: v_mov_b32_e32 v0, s0 5139; GFX7-NEXT: v_mov_b32_e32 v1, s1 5140; GFX7-NEXT: s_waitcnt vmcnt(0) 5141; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 5142; GFX7-NEXT: s_waitcnt vmcnt(0) 5143; GFX7-NEXT: buffer_wbinvl1_vol 5144; GFX7-NEXT: v_mov_b32_e32 v2, s2 5145; GFX7-NEXT: v_mov_b32_e32 v3, s3 5146; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) 5147; GFX7-NEXT: flat_store_dword v[2:3], v0 5148; GFX7-NEXT: s_endpgm 5149; 5150; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: 5151; GFX10-WGP: ; %bb.0: ; %entry 5152; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5153; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5154; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5155; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5156; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5157; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5158; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 5159; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5160; GFX10-WGP-NEXT: buffer_gl0_inv 5161; GFX10-WGP-NEXT: buffer_gl1_inv 5162; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 5163; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 5164; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5165; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5166; GFX10-WGP-NEXT: s_endpgm 5167; 5168; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: 5169; GFX10-CU: ; %bb.0: ; %entry 5170; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5171; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5172; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5173; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5174; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5175; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5176; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 5177; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5178; GFX10-CU-NEXT: buffer_gl0_inv 5179; GFX10-CU-NEXT: buffer_gl1_inv 5180; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 5181; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 5182; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5183; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5184; GFX10-CU-NEXT: s_endpgm 5185; 5186; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_load: 5187; SKIP-CACHE-INV: ; %bb.0: ; %entry 5188; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 5189; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5190; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5191; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5192; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5193; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 5194; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5195; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 5196; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 5197; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5198; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 5199; SKIP-CACHE-INV-NEXT: s_endpgm 5200; 5201; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: 5202; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5203; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5204; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5205; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5206; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 5207; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5208; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 5209; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5210; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 5211; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 5212; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5213; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 5214; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5215; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 5216; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5217; 5218; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: 5219; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5220; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5221; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5222; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5223; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 5224; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5225; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc 5226; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5227; GFX90A-TGSPLIT-NEXT: buffer_invl2 5228; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 5229; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5230; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 5231; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 5232; GFX90A-TGSPLIT-NEXT: s_endpgm 5233 i32* %in, i32* %out) { 5234entry: 5235 %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 5236 store i32 %val, i32* %out 5237 ret 
void 5238} 5239 5240define amdgpu_kernel void @flat_system_one_as_unordered_store( 5241; GFX7-LABEL: flat_system_one_as_unordered_store: 5242; GFX7: ; %bb.0: ; %entry 5243; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 5244; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 5245; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5246; GFX7-NEXT: v_mov_b32_e32 v2, s2 5247; GFX7-NEXT: v_mov_b32_e32 v0, s0 5248; GFX7-NEXT: v_mov_b32_e32 v1, s1 5249; GFX7-NEXT: flat_store_dword v[0:1], v2 5250; GFX7-NEXT: s_endpgm 5251; 5252; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: 5253; GFX10-WGP: ; %bb.0: ; %entry 5254; GFX10-WGP-NEXT: s_clause 0x1 5255; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5256; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 5257; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5258; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5259; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5260; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5261; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5262; GFX10-WGP-NEXT: s_endpgm 5263; 5264; GFX10-CU-LABEL: flat_system_one_as_unordered_store: 5265; GFX10-CU: ; %bb.0: ; %entry 5266; GFX10-CU-NEXT: s_clause 0x1 5267; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5268; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 5269; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5270; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5271; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5272; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5273; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5274; GFX10-CU-NEXT: s_endpgm 5275; 5276; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store: 5277; SKIP-CACHE-INV: ; %bb.0: ; %entry 5278; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 5279; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5280; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5281; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 5282; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5283; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5284; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5285; SKIP-CACHE-INV-NEXT: s_endpgm 5286; 5287; 
GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: 5288; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5289; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 5290; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5291; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5292; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5293; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5294; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5295; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5296; 5297; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: 5298; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5299; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 5300; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5301; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5302; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5303; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5304; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5305; GFX90A-TGSPLIT-NEXT: s_endpgm 5306 i32 %in, i32* %out) { 5307entry: 5308 store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 5309 ret void 5310} 5311 5312define amdgpu_kernel void @flat_system_one_as_monotonic_store( 5313; GFX7-LABEL: flat_system_one_as_monotonic_store: 5314; GFX7: ; %bb.0: ; %entry 5315; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 5316; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 5317; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5318; GFX7-NEXT: v_mov_b32_e32 v2, s2 5319; GFX7-NEXT: v_mov_b32_e32 v0, s0 5320; GFX7-NEXT: v_mov_b32_e32 v1, s1 5321; GFX7-NEXT: flat_store_dword v[0:1], v2 5322; GFX7-NEXT: s_endpgm 5323; 5324; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: 5325; GFX10-WGP: ; %bb.0: ; %entry 5326; GFX10-WGP-NEXT: s_clause 0x1 5327; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5328; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 5329; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5330; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5331; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5332; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5333; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5334; GFX10-WGP-NEXT: s_endpgm 5335; 5336; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: 5337; GFX10-CU: ; %bb.0: ; %entry 5338; GFX10-CU-NEXT: s_clause 0x1 5339; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5340; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 5341; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5342; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5343; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5344; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5345; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5346; GFX10-CU-NEXT: s_endpgm 5347; 5348; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store: 5349; SKIP-CACHE-INV: ; %bb.0: ; %entry 5350; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 5351; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5352; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5353; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 5354; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5355; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5356; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5357; SKIP-CACHE-INV-NEXT: s_endpgm 5358; 5359; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: 5360; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5361; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 5362; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5363; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5364; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5365; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5366; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5367; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5368; 5369; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: 5370; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5371; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 5372; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5373; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5374; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5375; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5376; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5377; GFX90A-TGSPLIT-NEXT: s_endpgm 5378 i32 %in, i32* %out) { 5379entry: 5380 store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 5381 ret void 5382} 5383 5384define amdgpu_kernel void @flat_system_one_as_release_store( 5385; GFX7-LABEL: flat_system_one_as_release_store: 5386; GFX7: ; %bb.0: ; %entry 5387; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 5388; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 5389; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5390; GFX7-NEXT: v_mov_b32_e32 v2, s2 5391; GFX7-NEXT: v_mov_b32_e32 v0, s0 5392; GFX7-NEXT: v_mov_b32_e32 v1, s1 5393; GFX7-NEXT: s_waitcnt vmcnt(0) 5394; GFX7-NEXT: flat_store_dword v[0:1], v2 5395; GFX7-NEXT: s_endpgm 5396; 5397; GFX10-WGP-LABEL: flat_system_one_as_release_store: 5398; GFX10-WGP: ; %bb.0: ; %entry 5399; GFX10-WGP-NEXT: s_clause 0x1 5400; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5401; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 5402; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5403; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5404; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5405; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5406; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5407; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5408; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5409; GFX10-WGP-NEXT: s_endpgm 5410; 5411; GFX10-CU-LABEL: flat_system_one_as_release_store: 5412; GFX10-CU: ; %bb.0: ; %entry 5413; GFX10-CU-NEXT: s_clause 0x1 5414; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5415; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 5416; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5417; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5418; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5419; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5420; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5421; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5422; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5423; GFX10-CU-NEXT: s_endpgm 5424; 5425; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store: 5426; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 5427; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 5428; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5429; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5430; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 5431; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5432; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5433; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5434; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5435; SKIP-CACHE-INV-NEXT: s_endpgm 5436; 5437; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: 5438; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5439; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 5440; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5441; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5442; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5443; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5444; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 5445; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5446; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5447; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5448; 5449; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: 5450; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5451; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 5452; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5453; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5454; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5455; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5456; GFX90A-TGSPLIT-NEXT: buffer_wbl2 5457; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5458; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5459; GFX90A-TGSPLIT-NEXT: s_endpgm 5460 i32 %in, i32* %out) { 5461entry: 5462 store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 5463 ret void 5464} 5465 5466define amdgpu_kernel void @flat_system_one_as_seq_cst_store( 5467; GFX7-LABEL: flat_system_one_as_seq_cst_store: 5468; GFX7: ; %bb.0: ; %entry 5469; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 5470; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 5471; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5472; GFX7-NEXT: v_mov_b32_e32 v2, s2 5473; GFX7-NEXT: v_mov_b32_e32 v0, s0 5474; GFX7-NEXT: v_mov_b32_e32 v1, s1 5475; GFX7-NEXT: s_waitcnt vmcnt(0) 5476; GFX7-NEXT: flat_store_dword v[0:1], v2 5477; GFX7-NEXT: s_endpgm 5478; 5479; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: 5480; GFX10-WGP: ; %bb.0: ; %entry 5481; GFX10-WGP-NEXT: s_clause 0x1 5482; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5483; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 5484; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5485; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5486; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5487; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5488; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5489; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5490; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5491; GFX10-WGP-NEXT: s_endpgm 5492; 5493; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: 5494; GFX10-CU: ; %bb.0: ; %entry 5495; GFX10-CU-NEXT: s_clause 0x1 5496; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5497; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 5498; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5499; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5500; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5501; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5502; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5503; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5504; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5505; GFX10-CU-NEXT: s_endpgm 5506; 5507; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store: 5508; SKIP-CACHE-INV: ; %bb.0: ; %entry 5509; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 5510; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5511; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5512; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 5513; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5514; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5515; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5516; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5517; 
SKIP-CACHE-INV-NEXT: s_endpgm 5518; 5519; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: 5520; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5521; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 5522; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5523; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5524; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5525; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5526; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 5527; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5528; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5529; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5530; 5531; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: 5532; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5533; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 5534; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5535; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5536; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5537; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5538; GFX90A-TGSPLIT-NEXT: buffer_wbl2 5539; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5540; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5541; GFX90A-TGSPLIT-NEXT: s_endpgm 5542 i32 %in, i32* %out) { 5543entry: 5544 store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4 5545 ret void 5546} 5547 5548define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( 5549; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: 5550; GFX7: ; %bb.0: ; %entry 5551; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5552; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5553; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5554; GFX7-NEXT: v_mov_b32_e32 v0, s0 5555; GFX7-NEXT: v_mov_b32_e32 v1, s1 5556; GFX7-NEXT: v_mov_b32_e32 v2, s2 5557; GFX7-NEXT: flat_atomic_swap v[0:1], v2 5558; GFX7-NEXT: s_endpgm 5559; 5560; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: 5561; GFX10-WGP: ; %bb.0: ; %entry 5562; GFX10-WGP-NEXT: s_clause 0x1 5563; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 5564; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5565; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5566; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5567; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5568; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5569; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 5570; GFX10-WGP-NEXT: s_endpgm 5571; 5572; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: 5573; GFX10-CU: ; %bb.0: ; %entry 5574; GFX10-CU-NEXT: s_clause 0x1 5575; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5576; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5577; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5578; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5579; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5580; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5581; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 5582; GFX10-CU-NEXT: s_endpgm 5583; 5584; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw: 5585; SKIP-CACHE-INV: ; %bb.0: ; %entry 5586; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5587; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5588; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5589; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5590; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5591; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5592; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 5593; SKIP-CACHE-INV-NEXT: s_endpgm 5594; 5595; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: 5596; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5597; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5598; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5599; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5600; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5601; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5602; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5603; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5604; 5605; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: 5606; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5607; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5608; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5609; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5610; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5611; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5612; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5613; GFX90A-TGSPLIT-NEXT: s_endpgm 5614 i32* %out, i32 %in) { 5615entry: 5616 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic 5617 ret void 5618} 5619 5620define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( 5621; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: 5622; GFX7: ; %bb.0: ; %entry 5623; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5624; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5625; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5626; GFX7-NEXT: v_mov_b32_e32 v0, s0 5627; GFX7-NEXT: v_mov_b32_e32 v1, s1 5628; GFX7-NEXT: v_mov_b32_e32 v2, s2 5629; GFX7-NEXT: flat_atomic_swap v[0:1], v2 5630; GFX7-NEXT: s_waitcnt vmcnt(0) 5631; GFX7-NEXT: buffer_wbinvl1_vol 5632; GFX7-NEXT: s_endpgm 5633; 5634; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: 5635; GFX10-WGP: ; %bb.0: ; %entry 5636; GFX10-WGP-NEXT: s_clause 0x1 5637; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5638; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5639; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5640; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5641; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5642; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5643; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 5644; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5645; GFX10-WGP-NEXT: buffer_gl0_inv 5646; GFX10-WGP-NEXT: buffer_gl1_inv 5647; GFX10-WGP-NEXT: s_endpgm 5648; 5649; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: 5650; GFX10-CU: ; %bb.0: ; %entry 5651; GFX10-CU-NEXT: s_clause 0x1 5652; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5653; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5654; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5655; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 5656; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5657; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5658; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 5659; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5660; GFX10-CU-NEXT: buffer_gl0_inv 5661; GFX10-CU-NEXT: buffer_gl1_inv 5662; GFX10-CU-NEXT: s_endpgm 5663; 5664; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw: 5665; SKIP-CACHE-INV: ; %bb.0: ; %entry 5666; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5667; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5668; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5669; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5670; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5671; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5672; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 5673; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5674; SKIP-CACHE-INV-NEXT: s_endpgm 5675; 5676; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: 5677; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5678; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5679; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5680; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5681; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5682; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5683; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5684; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5685; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 5686; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 5687; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5688; 5689; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: 5690; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5691; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5692; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5693; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5694; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5695; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5696; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5697; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5698; GFX90A-TGSPLIT-NEXT: buffer_invl2 5699; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 5700; GFX90A-TGSPLIT-NEXT: s_endpgm 5701 i32* %out, i32 %in) { 5702entry: 5703 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire 5704 ret void 5705} 5706 5707define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( 5708; GFX7-LABEL: flat_system_one_as_release_atomicrmw: 5709; GFX7: ; %bb.0: ; %entry 5710; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5711; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5712; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5713; GFX7-NEXT: v_mov_b32_e32 v0, s0 5714; GFX7-NEXT: v_mov_b32_e32 v1, s1 5715; GFX7-NEXT: v_mov_b32_e32 v2, s2 5716; GFX7-NEXT: s_waitcnt vmcnt(0) 5717; GFX7-NEXT: flat_atomic_swap v[0:1], v2 5718; GFX7-NEXT: s_endpgm 5719; 5720; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: 5721; GFX10-WGP: ; %bb.0: ; %entry 5722; GFX10-WGP-NEXT: s_clause 0x1 5723; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5724; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5725; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5726; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5727; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5728; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5729; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5730; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5731; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 5732; GFX10-WGP-NEXT: s_endpgm 5733; 5734; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: 5735; GFX10-CU: ; %bb.0: ; %entry 5736; GFX10-CU-NEXT: s_clause 0x1 5737; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5738; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5739; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5740; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5741; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5742; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5743; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5744; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5745; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 5746; GFX10-CU-NEXT: s_endpgm 5747; 5748; 
SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw: 5749; SKIP-CACHE-INV: ; %bb.0: ; %entry 5750; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5751; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5752; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5753; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5754; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5755; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5756; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5757; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 5758; SKIP-CACHE-INV-NEXT: s_endpgm 5759; 5760; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: 5761; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5762; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5763; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5764; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5765; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5766; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5767; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 5768; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5769; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5770; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5771; 5772; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: 5773; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5774; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5775; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5776; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5777; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5778; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5779; GFX90A-TGSPLIT-NEXT: buffer_wbl2 5780; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5781; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5782; GFX90A-TGSPLIT-NEXT: s_endpgm 5783 i32* %out, i32 %in) { 5784entry: 5785 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release 5786 ret void 5787} 5788 5789define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( 5790; GFX7-LABEL: 
flat_system_one_as_acq_rel_atomicrmw: 5791; GFX7: ; %bb.0: ; %entry 5792; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5793; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5794; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5795; GFX7-NEXT: v_mov_b32_e32 v0, s0 5796; GFX7-NEXT: v_mov_b32_e32 v1, s1 5797; GFX7-NEXT: v_mov_b32_e32 v2, s2 5798; GFX7-NEXT: s_waitcnt vmcnt(0) 5799; GFX7-NEXT: flat_atomic_swap v[0:1], v2 5800; GFX7-NEXT: s_waitcnt vmcnt(0) 5801; GFX7-NEXT: buffer_wbinvl1_vol 5802; GFX7-NEXT: s_endpgm 5803; 5804; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: 5805; GFX10-WGP: ; %bb.0: ; %entry 5806; GFX10-WGP-NEXT: s_clause 0x1 5807; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5808; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5809; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5810; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5811; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5812; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5813; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5814; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5815; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 5816; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5817; GFX10-WGP-NEXT: buffer_gl0_inv 5818; GFX10-WGP-NEXT: buffer_gl1_inv 5819; GFX10-WGP-NEXT: s_endpgm 5820; 5821; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: 5822; GFX10-CU: ; %bb.0: ; %entry 5823; GFX10-CU-NEXT: s_clause 0x1 5824; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5825; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5826; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5827; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5828; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5829; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5830; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5831; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5832; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 5833; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5834; GFX10-CU-NEXT: buffer_gl0_inv 5835; GFX10-CU-NEXT: buffer_gl1_inv 5836; GFX10-CU-NEXT: s_endpgm 5837; 5838; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw: 5839; SKIP-CACHE-INV: ; 
%bb.0: ; %entry 5840; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5841; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5842; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5843; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5844; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5845; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5846; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5847; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 5848; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5849; SKIP-CACHE-INV-NEXT: s_endpgm 5850; 5851; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: 5852; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5853; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5854; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5855; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5856; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5857; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5858; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 5859; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5860; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5861; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5862; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 5863; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 5864; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5865; 5866; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: 5867; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5868; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5869; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5870; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5871; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5872; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5873; GFX90A-TGSPLIT-NEXT: buffer_wbl2 5874; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5875; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5876; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5877; GFX90A-TGSPLIT-NEXT: buffer_invl2 5878; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 5879; GFX90A-TGSPLIT-NEXT: s_endpgm 5880 i32* %out, i32 %in) { 
5881entry: 5882 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel 5883 ret void 5884} 5885 5886define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( 5887; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: 5888; GFX7: ; %bb.0: ; %entry 5889; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5890; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5891; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5892; GFX7-NEXT: v_mov_b32_e32 v0, s0 5893; GFX7-NEXT: v_mov_b32_e32 v1, s1 5894; GFX7-NEXT: v_mov_b32_e32 v2, s2 5895; GFX7-NEXT: s_waitcnt vmcnt(0) 5896; GFX7-NEXT: flat_atomic_swap v[0:1], v2 5897; GFX7-NEXT: s_waitcnt vmcnt(0) 5898; GFX7-NEXT: buffer_wbinvl1_vol 5899; GFX7-NEXT: s_endpgm 5900; 5901; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: 5902; GFX10-WGP: ; %bb.0: ; %entry 5903; GFX10-WGP-NEXT: s_clause 0x1 5904; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5905; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5906; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5907; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5908; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5909; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5910; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5911; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5912; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 5913; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5914; GFX10-WGP-NEXT: buffer_gl0_inv 5915; GFX10-WGP-NEXT: buffer_gl1_inv 5916; GFX10-WGP-NEXT: s_endpgm 5917; 5918; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: 5919; GFX10-CU: ; %bb.0: ; %entry 5920; GFX10-CU-NEXT: s_clause 0x1 5921; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5922; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5923; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5924; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5925; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5926; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5927; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5928; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5929; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 5930; GFX10-CU-NEXT: s_waitcnt_vscnt null, 
0x0 5931; GFX10-CU-NEXT: buffer_gl0_inv 5932; GFX10-CU-NEXT: buffer_gl1_inv 5933; GFX10-CU-NEXT: s_endpgm 5934; 5935; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw: 5936; SKIP-CACHE-INV: ; %bb.0: ; %entry 5937; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5938; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5939; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5940; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5941; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5942; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5943; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5944; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 5945; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5946; SKIP-CACHE-INV-NEXT: s_endpgm 5947; 5948; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: 5949; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5950; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5951; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5952; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5953; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5954; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5955; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 5956; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5957; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5958; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5959; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 5960; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 5961; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5962; 5963; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: 5964; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5965; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5966; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5967; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5968; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5969; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5970; GFX90A-TGSPLIT-NEXT: buffer_wbl2 5971; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5972; GFX90A-TGSPLIT-NEXT: flat_atomic_swap 
v[0:1], v2 5973; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5974; GFX90A-TGSPLIT-NEXT: buffer_invl2 5975; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 5976; GFX90A-TGSPLIT-NEXT: s_endpgm 5977 i32* %out, i32 %in) { 5978entry: 5979 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst 5980 ret void 5981} 5982 5983define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( 5984; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: 5985; GFX7: ; %bb.0: ; %entry 5986; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5987; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5988; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5989; GFX7-NEXT: v_mov_b32_e32 v0, s0 5990; GFX7-NEXT: v_mov_b32_e32 v1, s1 5991; GFX7-NEXT: v_mov_b32_e32 v2, s2 5992; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5993; GFX7-NEXT: s_waitcnt vmcnt(0) 5994; GFX7-NEXT: buffer_wbinvl1_vol 5995; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5996; GFX7-NEXT: flat_store_dword v[0:1], v2 5997; GFX7-NEXT: s_endpgm 5998; 5999; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: 6000; GFX10-WGP: ; %bb.0: ; %entry 6001; GFX10-WGP-NEXT: s_clause 0x1 6002; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6003; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 6004; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6005; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6006; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6007; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6008; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6009; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 6010; GFX10-WGP-NEXT: buffer_gl0_inv 6011; GFX10-WGP-NEXT: buffer_gl1_inv 6012; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6013; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6014; GFX10-WGP-NEXT: s_endpgm 6015; 6016; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: 6017; GFX10-CU: ; %bb.0: ; %entry 6018; GFX10-CU-NEXT: s_clause 0x1 6019; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6020; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 6021; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6022; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 6023; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6024; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6025; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6026; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6027; GFX10-CU-NEXT: buffer_gl0_inv 6028; GFX10-CU-NEXT: buffer_gl1_inv 6029; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6030; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6031; GFX10-CU-NEXT: s_endpgm 6032; 6033; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw: 6034; SKIP-CACHE-INV: ; %bb.0: ; %entry 6035; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6036; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 6037; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6038; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6039; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6040; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6041; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6042; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6043; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6044; SKIP-CACHE-INV-NEXT: s_endpgm 6045; 6046; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: 6047; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6048; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6049; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 6050; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6051; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6052; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 6053; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6054; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6055; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 6056; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 6057; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6058; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6059; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6060; 6061; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: 6062; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6063; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6064; 
GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 6065; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6066; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6067; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 6068; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6069; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6070; GFX90A-TGSPLIT-NEXT: buffer_invl2 6071; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6072; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6073; GFX90A-TGSPLIT-NEXT: s_endpgm 6074 i32* %out, i32 %in) { 6075entry: 6076 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire 6077 store i32 %val, i32* %out, align 4 6078 ret void 6079} 6080 6081define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( 6082; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: 6083; GFX7: ; %bb.0: ; %entry 6084; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6085; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 6086; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6087; GFX7-NEXT: v_mov_b32_e32 v0, s0 6088; GFX7-NEXT: v_mov_b32_e32 v1, s1 6089; GFX7-NEXT: v_mov_b32_e32 v2, s2 6090; GFX7-NEXT: s_waitcnt vmcnt(0) 6091; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6092; GFX7-NEXT: s_waitcnt vmcnt(0) 6093; GFX7-NEXT: buffer_wbinvl1_vol 6094; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6095; GFX7-NEXT: flat_store_dword v[0:1], v2 6096; GFX7-NEXT: s_endpgm 6097; 6098; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: 6099; GFX10-WGP: ; %bb.0: ; %entry 6100; GFX10-WGP-NEXT: s_clause 0x1 6101; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6102; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 6103; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6104; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6105; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6106; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6107; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 6108; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6109; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6110; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 6111; 
GFX10-WGP-NEXT: buffer_gl0_inv 6112; GFX10-WGP-NEXT: buffer_gl1_inv 6113; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6114; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6115; GFX10-WGP-NEXT: s_endpgm 6116; 6117; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: 6118; GFX10-CU: ; %bb.0: ; %entry 6119; GFX10-CU-NEXT: s_clause 0x1 6120; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6121; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 6122; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6123; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6124; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6125; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6126; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6127; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6128; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6129; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6130; GFX10-CU-NEXT: buffer_gl0_inv 6131; GFX10-CU-NEXT: buffer_gl1_inv 6132; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6133; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6134; GFX10-CU-NEXT: s_endpgm 6135; 6136; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: 6137; SKIP-CACHE-INV: ; %bb.0: ; %entry 6138; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6139; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 6140; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6141; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6142; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6143; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6144; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6145; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6146; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6147; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6148; SKIP-CACHE-INV-NEXT: s_endpgm 6149; 6150; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: 6151; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6152; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6153; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 6154; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6155; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6156; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 6157; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 6158; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6159; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6160; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6161; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 6162; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 6163; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6164; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6165; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6166; 6167; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: 6168; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6169; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6170; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 6171; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6172; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6173; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 6174; GFX90A-TGSPLIT-NEXT: buffer_wbl2 6175; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6176; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6177; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6178; GFX90A-TGSPLIT-NEXT: buffer_invl2 6179; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6180; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6181; GFX90A-TGSPLIT-NEXT: s_endpgm 6182 i32* %out, i32 %in) { 6183entry: 6184 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel 6185 store i32 %val, i32* %out, align 4 6186 ret void 6187} 6188 6189define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( 6190; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: 6191; GFX7: ; %bb.0: ; %entry 6192; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6193; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 6194; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6195; GFX7-NEXT: v_mov_b32_e32 v0, s0 6196; GFX7-NEXT: v_mov_b32_e32 v1, s1 6197; GFX7-NEXT: v_mov_b32_e32 v2, s2 6198; GFX7-NEXT: s_waitcnt vmcnt(0) 6199; GFX7-NEXT: flat_atomic_swap v2, 
v[0:1], v2 glc 6200; GFX7-NEXT: s_waitcnt vmcnt(0) 6201; GFX7-NEXT: buffer_wbinvl1_vol 6202; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6203; GFX7-NEXT: flat_store_dword v[0:1], v2 6204; GFX7-NEXT: s_endpgm 6205; 6206; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: 6207; GFX10-WGP: ; %bb.0: ; %entry 6208; GFX10-WGP-NEXT: s_clause 0x1 6209; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6210; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 6211; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6212; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6213; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6214; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6215; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 6216; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6217; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6218; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 6219; GFX10-WGP-NEXT: buffer_gl0_inv 6220; GFX10-WGP-NEXT: buffer_gl1_inv 6221; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6222; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6223; GFX10-WGP-NEXT: s_endpgm 6224; 6225; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: 6226; GFX10-CU: ; %bb.0: ; %entry 6227; GFX10-CU-NEXT: s_clause 0x1 6228; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6229; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 6230; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6231; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6232; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6233; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6234; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6235; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6236; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6237; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6238; GFX10-CU-NEXT: buffer_gl0_inv 6239; GFX10-CU-NEXT: buffer_gl1_inv 6240; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6241; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6242; GFX10-CU-NEXT: s_endpgm 6243; 6244; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: 6245; SKIP-CACHE-INV: ; %bb.0: ; %entry 6246; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6247; SKIP-CACHE-INV-NEXT: 
s_load_dword s0, s[0:1], 0xb 6248; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6249; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6250; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6251; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6252; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6253; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6254; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6255; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6256; SKIP-CACHE-INV-NEXT: s_endpgm 6257; 6258; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: 6259; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6260; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6261; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 6262; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6263; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6264; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 6265; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 6266; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6267; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6268; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6269; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 6270; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 6271; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6272; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6273; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6274; 6275; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: 6276; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6277; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6278; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 6279; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6280; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6281; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 6282; GFX90A-TGSPLIT-NEXT: buffer_wbl2 6283; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6284; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 6285; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6286; GFX90A-TGSPLIT-NEXT: buffer_invl2 6287; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6288; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6289; GFX90A-TGSPLIT-NEXT: s_endpgm 6290 i32* %out, i32 %in) { 6291entry: 6292 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst 6293 store i32 %val, i32* %out, align 4 6294 ret void 6295} 6296 6297define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( 6298; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: 6299; GFX7: ; %bb.0: ; %entry 6300; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6301; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6302; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6303; GFX7-NEXT: s_add_u32 s0, s0, 16 6304; GFX7-NEXT: s_addc_u32 s1, s1, 0 6305; GFX7-NEXT: v_mov_b32_e32 v0, s0 6306; GFX7-NEXT: v_mov_b32_e32 v2, s2 6307; GFX7-NEXT: v_mov_b32_e32 v1, s1 6308; GFX7-NEXT: v_mov_b32_e32 v3, s3 6309; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6310; GFX7-NEXT: s_endpgm 6311; 6312; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: 6313; GFX10-WGP: ; %bb.0: ; %entry 6314; GFX10-WGP-NEXT: s_clause 0x1 6315; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6316; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6317; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6318; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6319; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6320; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6321; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6322; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6323; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6324; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6325; GFX10-WGP-NEXT: s_endpgm 6326; 6327; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: 6328; GFX10-CU: ; %bb.0: ; %entry 6329; GFX10-CU-NEXT: s_clause 0x1 6330; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6331; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6332; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6333; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6334; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6335; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 6336; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6337; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6338; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6339; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6340; GFX10-CU-NEXT: s_endpgm 6341; 6342; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: 6343; SKIP-CACHE-INV: ; %bb.0: ; %entry 6344; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6345; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6346; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6347; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6348; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6349; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6350; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6351; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6352; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6353; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6354; SKIP-CACHE-INV-NEXT: s_endpgm 6355; 6356; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: 6357; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6358; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6359; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6360; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6361; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6362; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6363; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6364; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6365; 6366; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: 6367; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6368; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6369; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6370; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6371; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6372; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6373; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], 
v[2:3] offset:16 6374; GFX90A-TGSPLIT-NEXT: s_endpgm 6375 i32* %out, i32 %in, i32 %old) { 6376entry: 6377 %gep = getelementptr i32, i32* %out, i32 4 6378 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic 6379 ret void 6380} 6381 6382define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( 6383; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: 6384; GFX7: ; %bb.0: ; %entry 6385; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6386; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6387; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6388; GFX7-NEXT: s_add_u32 s0, s0, 16 6389; GFX7-NEXT: s_addc_u32 s1, s1, 0 6390; GFX7-NEXT: v_mov_b32_e32 v0, s0 6391; GFX7-NEXT: v_mov_b32_e32 v2, s2 6392; GFX7-NEXT: v_mov_b32_e32 v1, s1 6393; GFX7-NEXT: v_mov_b32_e32 v3, s3 6394; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6395; GFX7-NEXT: s_waitcnt vmcnt(0) 6396; GFX7-NEXT: buffer_wbinvl1_vol 6397; GFX7-NEXT: s_endpgm 6398; 6399; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: 6400; GFX10-WGP: ; %bb.0: ; %entry 6401; GFX10-WGP-NEXT: s_clause 0x1 6402; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6403; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6404; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6405; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6406; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6407; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6408; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6409; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6410; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6411; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6412; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6413; GFX10-WGP-NEXT: buffer_gl0_inv 6414; GFX10-WGP-NEXT: buffer_gl1_inv 6415; GFX10-WGP-NEXT: s_endpgm 6416; 6417; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: 6418; GFX10-CU: ; %bb.0: ; %entry 6419; GFX10-CU-NEXT: s_clause 0x1 6420; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6421; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6422; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6423; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6424; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6425; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6426; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6427; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6428; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6429; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6430; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6431; GFX10-CU-NEXT: buffer_gl0_inv 6432; GFX10-CU-NEXT: buffer_gl1_inv 6433; GFX10-CU-NEXT: s_endpgm 6434; 6435; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: 6436; SKIP-CACHE-INV: ; %bb.0: ; %entry 6437; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6438; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6439; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6440; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6441; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6442; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6443; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6444; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6445; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6446; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6447; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6448; SKIP-CACHE-INV-NEXT: s_endpgm 6449; 6450; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: 6451; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6452; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6453; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6454; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6455; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6456; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6457; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6458; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6459; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 6460; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 6461; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6462; 6463; GFX90A-TGSPLIT-LABEL: 
flat_system_one_as_acquire_monotonic_cmpxchg: 6464; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6465; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6466; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6467; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6468; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6469; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6470; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6471; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6472; GFX90A-TGSPLIT-NEXT: buffer_invl2 6473; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6474; GFX90A-TGSPLIT-NEXT: s_endpgm 6475 i32* %out, i32 %in, i32 %old) { 6476entry: 6477 %gep = getelementptr i32, i32* %out, i32 4 6478 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic 6479 ret void 6480} 6481 6482define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( 6483; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: 6484; GFX7: ; %bb.0: ; %entry 6485; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6486; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6487; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6488; GFX7-NEXT: s_add_u32 s0, s0, 16 6489; GFX7-NEXT: s_addc_u32 s1, s1, 0 6490; GFX7-NEXT: v_mov_b32_e32 v0, s0 6491; GFX7-NEXT: v_mov_b32_e32 v2, s2 6492; GFX7-NEXT: v_mov_b32_e32 v1, s1 6493; GFX7-NEXT: v_mov_b32_e32 v3, s3 6494; GFX7-NEXT: s_waitcnt vmcnt(0) 6495; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6496; GFX7-NEXT: s_endpgm 6497; 6498; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: 6499; GFX10-WGP: ; %bb.0: ; %entry 6500; GFX10-WGP-NEXT: s_clause 0x1 6501; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6502; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6503; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6504; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6505; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6506; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6507; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 
6508; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6509; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6510; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 6511; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6512; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6513; GFX10-WGP-NEXT: s_endpgm 6514; 6515; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: 6516; GFX10-CU: ; %bb.0: ; %entry 6517; GFX10-CU-NEXT: s_clause 0x1 6518; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6519; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6520; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6521; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6522; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6523; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6524; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6525; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6526; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6527; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6528; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6529; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6530; GFX10-CU-NEXT: s_endpgm 6531; 6532; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg: 6533; SKIP-CACHE-INV: ; %bb.0: ; %entry 6534; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6535; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6536; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6537; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6538; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6539; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6540; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6541; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6542; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6543; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6544; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6545; SKIP-CACHE-INV-NEXT: s_endpgm 6546; 6547; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: 6548; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6549; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6550; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6551; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 6552; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6553; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6554; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 6555; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6556; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6557; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6558; 6559; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: 6560; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6561; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6562; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6563; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6564; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6565; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6566; GFX90A-TGSPLIT-NEXT: buffer_wbl2 6567; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6568; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6569; GFX90A-TGSPLIT-NEXT: s_endpgm 6570 i32* %out, i32 %in, i32 %old) { 6571entry: 6572 %gep = getelementptr i32, i32* %out, i32 4 6573 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic 6574 ret void 6575} 6576 6577define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( 6578; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: 6579; GFX7: ; %bb.0: ; %entry 6580; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6581; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6582; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6583; GFX7-NEXT: s_add_u32 s0, s0, 16 6584; GFX7-NEXT: s_addc_u32 s1, s1, 0 6585; GFX7-NEXT: v_mov_b32_e32 v0, s0 6586; GFX7-NEXT: v_mov_b32_e32 v2, s2 6587; GFX7-NEXT: v_mov_b32_e32 v1, s1 6588; GFX7-NEXT: v_mov_b32_e32 v3, s3 6589; GFX7-NEXT: s_waitcnt vmcnt(0) 6590; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6591; GFX7-NEXT: s_waitcnt vmcnt(0) 6592; GFX7-NEXT: buffer_wbinvl1_vol 6593; GFX7-NEXT: s_endpgm 6594; 6595; GFX10-WGP-LABEL: 
flat_system_one_as_acq_rel_monotonic_cmpxchg: 6596; GFX10-WGP: ; %bb.0: ; %entry 6597; GFX10-WGP-NEXT: s_clause 0x1 6598; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6599; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6600; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6601; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6602; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6603; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6604; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6605; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6606; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6607; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 6608; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6609; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6610; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6611; GFX10-WGP-NEXT: buffer_gl0_inv 6612; GFX10-WGP-NEXT: buffer_gl1_inv 6613; GFX10-WGP-NEXT: s_endpgm 6614; 6615; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: 6616; GFX10-CU: ; %bb.0: ; %entry 6617; GFX10-CU-NEXT: s_clause 0x1 6618; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6619; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6620; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6621; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6622; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6623; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6624; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6625; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6626; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6627; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6628; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6629; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6630; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6631; GFX10-CU-NEXT: buffer_gl0_inv 6632; GFX10-CU-NEXT: buffer_gl1_inv 6633; GFX10-CU-NEXT: s_endpgm 6634; 6635; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: 6636; SKIP-CACHE-INV: ; %bb.0: ; %entry 6637; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6638; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6639; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6640; SKIP-CACHE-INV-NEXT: s_add_u32 
s2, s2, 16 6641; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6642; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6643; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6644; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6646; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6647; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6648; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6649; SKIP-CACHE-INV-NEXT: s_endpgm 6650; 6651; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: 6652; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6653; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6654; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6655; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6656; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6657; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6658; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 6659; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6660; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6661; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6662; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 6663; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 6664; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6665; 6666; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: 6667; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6668; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6669; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6670; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6671; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6672; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6673; GFX90A-TGSPLIT-NEXT: buffer_wbl2 6674; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6675; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6676; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6677; GFX90A-TGSPLIT-NEXT: buffer_invl2 6678; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6679; 
GFX90A-TGSPLIT-NEXT: s_endpgm 6680 i32* %out, i32 %in, i32 %old) { 6681entry: 6682 %gep = getelementptr i32, i32* %out, i32 4 6683 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic 6684 ret void 6685} 6686 6687define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( 6688; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: 6689; GFX7: ; %bb.0: ; %entry 6690; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6691; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6692; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6693; GFX7-NEXT: s_add_u32 s0, s0, 16 6694; GFX7-NEXT: s_addc_u32 s1, s1, 0 6695; GFX7-NEXT: v_mov_b32_e32 v0, s0 6696; GFX7-NEXT: v_mov_b32_e32 v2, s2 6697; GFX7-NEXT: v_mov_b32_e32 v1, s1 6698; GFX7-NEXT: v_mov_b32_e32 v3, s3 6699; GFX7-NEXT: s_waitcnt vmcnt(0) 6700; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6701; GFX7-NEXT: s_waitcnt vmcnt(0) 6702; GFX7-NEXT: buffer_wbinvl1_vol 6703; GFX7-NEXT: s_endpgm 6704; 6705; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: 6706; GFX10-WGP: ; %bb.0: ; %entry 6707; GFX10-WGP-NEXT: s_clause 0x1 6708; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6709; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6710; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6711; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6712; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6713; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6714; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6715; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6716; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6717; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 6718; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6719; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6720; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6721; GFX10-WGP-NEXT: buffer_gl0_inv 6722; GFX10-WGP-NEXT: buffer_gl1_inv 6723; GFX10-WGP-NEXT: s_endpgm 6724; 6725; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: 6726; GFX10-CU: ; %bb.0: ; %entry 6727; GFX10-CU-NEXT: s_clause 0x1 6728; GFX10-CU-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 6729; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6730; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6731; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6732; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6733; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6734; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6735; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6736; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6737; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6738; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6739; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6740; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6741; GFX10-CU-NEXT: buffer_gl0_inv 6742; GFX10-CU-NEXT: buffer_gl1_inv 6743; GFX10-CU-NEXT: s_endpgm 6744; 6745; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: 6746; SKIP-CACHE-INV: ; %bb.0: ; %entry 6747; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6748; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6749; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6750; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6751; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6752; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6753; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6754; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6755; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6756; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6757; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6758; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6759; SKIP-CACHE-INV-NEXT: s_endpgm 6760; 6761; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: 6762; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6763; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6764; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6765; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6766; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6767; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6768; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 6769; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) 6770; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6771; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6772; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 6773; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 6774; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6775; 6776; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: 6777; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6778; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6779; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6780; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6781; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6782; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6783; GFX90A-TGSPLIT-NEXT: buffer_wbl2 6784; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6785; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6786; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6787; GFX90A-TGSPLIT-NEXT: buffer_invl2 6788; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6789; GFX90A-TGSPLIT-NEXT: s_endpgm 6790 i32* %out, i32 %in, i32 %old) { 6791entry: 6792 %gep = getelementptr i32, i32* %out, i32 4 6793 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic 6794 ret void 6795} 6796 6797define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( 6798; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: 6799; GFX7: ; %bb.0: ; %entry 6800; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6801; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6802; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6803; GFX7-NEXT: s_add_u32 s0, s0, 16 6804; GFX7-NEXT: s_addc_u32 s1, s1, 0 6805; GFX7-NEXT: v_mov_b32_e32 v0, s0 6806; GFX7-NEXT: v_mov_b32_e32 v2, s2 6807; GFX7-NEXT: v_mov_b32_e32 v1, s1 6808; GFX7-NEXT: v_mov_b32_e32 v3, s3 6809; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6810; GFX7-NEXT: s_waitcnt vmcnt(0) 6811; GFX7-NEXT: buffer_wbinvl1_vol 6812; GFX7-NEXT: s_endpgm 6813; 6814; GFX10-WGP-LABEL: 
flat_system_one_as_monotonic_acquire_cmpxchg: 6815; GFX10-WGP: ; %bb.0: ; %entry 6816; GFX10-WGP-NEXT: s_clause 0x1 6817; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6818; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6819; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6820; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6821; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6822; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6823; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6824; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6825; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6826; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6827; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6828; GFX10-WGP-NEXT: buffer_gl0_inv 6829; GFX10-WGP-NEXT: buffer_gl1_inv 6830; GFX10-WGP-NEXT: s_endpgm 6831; 6832; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: 6833; GFX10-CU: ; %bb.0: ; %entry 6834; GFX10-CU-NEXT: s_clause 0x1 6835; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6836; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6837; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6838; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6839; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6840; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6841; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6842; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6843; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6844; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6845; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6846; GFX10-CU-NEXT: buffer_gl0_inv 6847; GFX10-CU-NEXT: buffer_gl1_inv 6848; GFX10-CU-NEXT: s_endpgm 6849; 6850; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: 6851; SKIP-CACHE-INV: ; %bb.0: ; %entry 6852; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6853; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6854; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6855; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6856; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6857; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6858; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6859; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6860; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6861; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6862; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6863; SKIP-CACHE-INV-NEXT: s_endpgm 6864; 6865; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: 6866; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6867; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6868; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6869; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6870; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6871; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6872; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6873; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6874; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 6875; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 6876; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6877; 6878; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: 6879; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6880; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6881; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6882; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6883; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6884; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6885; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6886; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6887; GFX90A-TGSPLIT-NEXT: buffer_invl2 6888; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6889; GFX90A-TGSPLIT-NEXT: s_endpgm 6890 i32* %out, i32 %in, i32 %old) { 6891entry: 6892 %gep = getelementptr i32, i32* %out, i32 4 6893 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire 6894 ret void 6895} 6896 6897define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( 6898; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: 6899; 
GFX7: ; %bb.0: ; %entry 6900; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6901; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6902; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6903; GFX7-NEXT: s_add_u32 s0, s0, 16 6904; GFX7-NEXT: s_addc_u32 s1, s1, 0 6905; GFX7-NEXT: v_mov_b32_e32 v0, s0 6906; GFX7-NEXT: v_mov_b32_e32 v2, s2 6907; GFX7-NEXT: v_mov_b32_e32 v1, s1 6908; GFX7-NEXT: v_mov_b32_e32 v3, s3 6909; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6910; GFX7-NEXT: s_waitcnt vmcnt(0) 6911; GFX7-NEXT: buffer_wbinvl1_vol 6912; GFX7-NEXT: s_endpgm 6913; 6914; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: 6915; GFX10-WGP: ; %bb.0: ; %entry 6916; GFX10-WGP-NEXT: s_clause 0x1 6917; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6918; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6919; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6920; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6921; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6922; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6923; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6924; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6925; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6926; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6927; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6928; GFX10-WGP-NEXT: buffer_gl0_inv 6929; GFX10-WGP-NEXT: buffer_gl1_inv 6930; GFX10-WGP-NEXT: s_endpgm 6931; 6932; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: 6933; GFX10-CU: ; %bb.0: ; %entry 6934; GFX10-CU-NEXT: s_clause 0x1 6935; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6936; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6937; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6938; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6939; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6940; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6941; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6942; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6943; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6944; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6945; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 6946; GFX10-CU-NEXT: buffer_gl0_inv 
6947; GFX10-CU-NEXT: buffer_gl1_inv 6948; GFX10-CU-NEXT: s_endpgm 6949; 6950; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: 6951; SKIP-CACHE-INV: ; %bb.0: ; %entry 6952; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6953; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6954; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6955; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6956; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6957; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6958; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6959; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6960; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6961; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6962; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6963; SKIP-CACHE-INV-NEXT: s_endpgm 6964; 6965; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: 6966; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6967; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6968; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6969; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6970; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6971; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6972; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6973; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6974; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 6975; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 6976; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6977; 6978; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: 6979; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6980; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6981; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6982; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6983; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6984; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6985; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] offset:16 6986; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6987; GFX90A-TGSPLIT-NEXT: buffer_invl2 6988; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6989; GFX90A-TGSPLIT-NEXT: s_endpgm 6990 i32* %out, i32 %in, i32 %old) { 6991entry: 6992 %gep = getelementptr i32, i32* %out, i32 4 6993 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire 6994 ret void 6995} 6996 6997define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( 6998; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: 6999; GFX7: ; %bb.0: ; %entry 7000; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7001; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7002; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7003; GFX7-NEXT: s_add_u32 s0, s0, 16 7004; GFX7-NEXT: s_addc_u32 s1, s1, 0 7005; GFX7-NEXT: v_mov_b32_e32 v0, s0 7006; GFX7-NEXT: v_mov_b32_e32 v2, s2 7007; GFX7-NEXT: v_mov_b32_e32 v1, s1 7008; GFX7-NEXT: v_mov_b32_e32 v3, s3 7009; GFX7-NEXT: s_waitcnt vmcnt(0) 7010; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7011; GFX7-NEXT: s_waitcnt vmcnt(0) 7012; GFX7-NEXT: buffer_wbinvl1_vol 7013; GFX7-NEXT: s_endpgm 7014; 7015; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: 7016; GFX10-WGP: ; %bb.0: ; %entry 7017; GFX10-WGP-NEXT: s_clause 0x1 7018; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7019; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7020; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7021; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 7022; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 7023; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7024; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7025; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7026; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7027; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 7028; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7029; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7030; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7031; GFX10-WGP-NEXT: buffer_gl0_inv 7032; GFX10-WGP-NEXT: buffer_gl1_inv 7033; GFX10-WGP-NEXT: s_endpgm 7034; 7035; 
GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: 7036; GFX10-CU: ; %bb.0: ; %entry 7037; GFX10-CU-NEXT: s_clause 0x1 7038; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7039; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7040; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7041; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 7042; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 7043; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7044; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7045; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7046; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7047; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7048; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7049; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7050; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7051; GFX10-CU-NEXT: buffer_gl0_inv 7052; GFX10-CU-NEXT: buffer_gl1_inv 7053; GFX10-CU-NEXT: s_endpgm 7054; 7055; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg: 7056; SKIP-CACHE-INV: ; %bb.0: ; %entry 7057; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7058; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7059; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7060; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 7061; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 7062; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7063; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7064; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7065; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7066; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7067; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7068; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7069; SKIP-CACHE-INV-NEXT: s_endpgm 7070; 7071; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: 7072; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7073; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7074; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7075; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7076; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7077; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7078; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 7079; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7080; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7081; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7082; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 7083; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 7084; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7085; 7086; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: 7087; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7088; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7089; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7090; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7091; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7092; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7093; GFX90A-TGSPLIT-NEXT: buffer_wbl2 7094; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7095; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7096; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7097; GFX90A-TGSPLIT-NEXT: buffer_invl2 7098; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7099; GFX90A-TGSPLIT-NEXT: s_endpgm 7100 i32* %out, i32 %in, i32 %old) { 7101entry: 7102 %gep = getelementptr i32, i32* %out, i32 4 7103 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire 7104 ret void 7105} 7106 7107define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( 7108; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: 7109; GFX7: ; %bb.0: ; %entry 7110; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7111; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7112; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7113; GFX7-NEXT: s_add_u32 s0, s0, 16 7114; GFX7-NEXT: s_addc_u32 s1, s1, 0 7115; GFX7-NEXT: v_mov_b32_e32 v0, s0 7116; GFX7-NEXT: v_mov_b32_e32 v2, s2 7117; GFX7-NEXT: v_mov_b32_e32 v1, s1 7118; GFX7-NEXT: v_mov_b32_e32 v3, s3 7119; GFX7-NEXT: s_waitcnt vmcnt(0) 7120; GFX7-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 7121; GFX7-NEXT: s_waitcnt vmcnt(0) 7122; GFX7-NEXT: buffer_wbinvl1_vol 7123; GFX7-NEXT: s_endpgm 7124; 7125; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: 7126; GFX10-WGP: ; %bb.0: ; %entry 7127; GFX10-WGP-NEXT: s_clause 0x1 7128; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7129; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7130; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7131; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 7132; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 7133; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7134; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7135; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7136; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7137; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 7138; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7139; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7140; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7141; GFX10-WGP-NEXT: buffer_gl0_inv 7142; GFX10-WGP-NEXT: buffer_gl1_inv 7143; GFX10-WGP-NEXT: s_endpgm 7144; 7145; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: 7146; GFX10-CU: ; %bb.0: ; %entry 7147; GFX10-CU-NEXT: s_clause 0x1 7148; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7149; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7150; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7151; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 7152; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 7153; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7154; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7155; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7156; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7157; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7158; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7159; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7160; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7161; GFX10-CU-NEXT: buffer_gl0_inv 7162; GFX10-CU-NEXT: buffer_gl1_inv 7163; GFX10-CU-NEXT: s_endpgm 7164; 7165; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: 7166; SKIP-CACHE-INV: ; %bb.0: ; %entry 7167; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x9 7168; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7169; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7170; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 7171; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 7172; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7173; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7174; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7175; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7176; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7177; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7178; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7179; SKIP-CACHE-INV-NEXT: s_endpgm 7180; 7181; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: 7182; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7183; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7184; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7185; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7186; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7187; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7188; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 7189; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7190; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7191; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7192; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 7193; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 7194; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7195; 7196; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: 7197; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7198; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7199; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7200; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7201; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7202; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7203; GFX90A-TGSPLIT-NEXT: buffer_wbl2 7204; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7205; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 7206; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7207; GFX90A-TGSPLIT-NEXT: buffer_invl2 7208; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7209; GFX90A-TGSPLIT-NEXT: s_endpgm 7210 i32* %out, i32 %in, i32 %old) { 7211entry: 7212 %gep = getelementptr i32, i32* %out, i32 4 7213 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire 7214 ret void 7215} 7216 7217define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( 7218; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: 7219; GFX7: ; %bb.0: ; %entry 7220; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7221; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7222; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7223; GFX7-NEXT: s_add_u32 s0, s0, 16 7224; GFX7-NEXT: s_addc_u32 s1, s1, 0 7225; GFX7-NEXT: v_mov_b32_e32 v0, s0 7226; GFX7-NEXT: v_mov_b32_e32 v2, s2 7227; GFX7-NEXT: v_mov_b32_e32 v1, s1 7228; GFX7-NEXT: v_mov_b32_e32 v3, s3 7229; GFX7-NEXT: s_waitcnt vmcnt(0) 7230; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7231; GFX7-NEXT: s_waitcnt vmcnt(0) 7232; GFX7-NEXT: buffer_wbinvl1_vol 7233; GFX7-NEXT: s_endpgm 7234; 7235; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: 7236; GFX10-WGP: ; %bb.0: ; %entry 7237; GFX10-WGP-NEXT: s_clause 0x1 7238; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7239; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7240; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7241; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 7242; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 7243; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7244; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7245; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7246; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7247; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 7248; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7249; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7250; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7251; GFX10-WGP-NEXT: buffer_gl0_inv 7252; GFX10-WGP-NEXT: buffer_gl1_inv 7253; GFX10-WGP-NEXT: s_endpgm 7254; 7255; 
GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: 7256; GFX10-CU: ; %bb.0: ; %entry 7257; GFX10-CU-NEXT: s_clause 0x1 7258; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7259; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7260; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7261; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 7262; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 7263; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7264; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7265; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7266; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7267; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7268; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7269; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7270; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7271; GFX10-CU-NEXT: buffer_gl0_inv 7272; GFX10-CU-NEXT: buffer_gl1_inv 7273; GFX10-CU-NEXT: s_endpgm 7274; 7275; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: 7276; SKIP-CACHE-INV: ; %bb.0: ; %entry 7277; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7278; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7279; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7280; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 7281; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 7282; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7283; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7284; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7285; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7286; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7287; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7288; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7289; SKIP-CACHE-INV-NEXT: s_endpgm 7290; 7291; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: 7292; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7293; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7294; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7295; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7296; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7297; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7298; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 7299; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7300; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7301; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7302; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 7303; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 7304; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7305; 7306; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: 7307; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7308; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7309; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7310; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7311; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7312; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7313; GFX90A-TGSPLIT-NEXT: buffer_wbl2 7314; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7315; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7316; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7317; GFX90A-TGSPLIT-NEXT: buffer_invl2 7318; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7319; GFX90A-TGSPLIT-NEXT: s_endpgm 7320 i32* %out, i32 %in, i32 %old) { 7321entry: 7322 %gep = getelementptr i32, i32* %out, i32 4 7323 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire 7324 ret void 7325} 7326 7327define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( 7328; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: 7329; GFX7: ; %bb.0: ; %entry 7330; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7331; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7332; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7333; GFX7-NEXT: s_add_u32 s0, s0, 16 7334; GFX7-NEXT: s_addc_u32 s1, s1, 0 7335; GFX7-NEXT: v_mov_b32_e32 v0, s0 7336; GFX7-NEXT: v_mov_b32_e32 v2, s2 7337; GFX7-NEXT: v_mov_b32_e32 v1, s1 7338; GFX7-NEXT: v_mov_b32_e32 v3, s3 7339; GFX7-NEXT: s_waitcnt vmcnt(0) 7340; GFX7-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 7341; GFX7-NEXT: s_waitcnt vmcnt(0) 7342; GFX7-NEXT: buffer_wbinvl1_vol 7343; GFX7-NEXT: s_endpgm 7344; 7345; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: 7346; GFX10-WGP: ; %bb.0: ; %entry 7347; GFX10-WGP-NEXT: s_clause 0x1 7348; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7349; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7350; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7351; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 7352; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 7353; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7354; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7355; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7356; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7357; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 7358; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7359; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7360; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7361; GFX10-WGP-NEXT: buffer_gl0_inv 7362; GFX10-WGP-NEXT: buffer_gl1_inv 7363; GFX10-WGP-NEXT: s_endpgm 7364; 7365; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: 7366; GFX10-CU: ; %bb.0: ; %entry 7367; GFX10-CU-NEXT: s_clause 0x1 7368; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7369; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7370; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7371; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 7372; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 7373; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7374; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7375; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7376; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7377; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7378; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7379; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7380; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7381; GFX10-CU-NEXT: buffer_gl0_inv 7382; GFX10-CU-NEXT: buffer_gl1_inv 7383; GFX10-CU-NEXT: s_endpgm 7384; 7385; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: 7386; SKIP-CACHE-INV: ; %bb.0: ; %entry 7387; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x9 7388; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7389; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7390; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 7391; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 7392; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7393; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7394; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7395; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7396; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7397; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7398; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7399; SKIP-CACHE-INV-NEXT: s_endpgm 7400; 7401; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: 7402; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7403; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7404; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7405; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7406; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7407; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7408; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 7409; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7410; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7411; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7412; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 7413; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 7414; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7415; 7416; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: 7417; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7418; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7419; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7420; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7421; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7422; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7423; GFX90A-TGSPLIT-NEXT: buffer_wbl2 7424; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7425; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] offset:16 7426; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7427; GFX90A-TGSPLIT-NEXT: buffer_invl2 7428; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7429; GFX90A-TGSPLIT-NEXT: s_endpgm 7430 i32* %out, i32 %in, i32 %old) { 7431entry: 7432 %gep = getelementptr i32, i32* %out, i32 4 7433 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst 7434 ret void 7435} 7436 7437define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( 7438; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: 7439; GFX7: ; %bb.0: ; %entry 7440; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7441; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7442; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7443; GFX7-NEXT: s_add_u32 s0, s0, 16 7444; GFX7-NEXT: s_addc_u32 s1, s1, 0 7445; GFX7-NEXT: v_mov_b32_e32 v0, s0 7446; GFX7-NEXT: v_mov_b32_e32 v2, s2 7447; GFX7-NEXT: v_mov_b32_e32 v1, s1 7448; GFX7-NEXT: v_mov_b32_e32 v3, s3 7449; GFX7-NEXT: s_waitcnt vmcnt(0) 7450; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7451; GFX7-NEXT: s_waitcnt vmcnt(0) 7452; GFX7-NEXT: buffer_wbinvl1_vol 7453; GFX7-NEXT: s_endpgm 7454; 7455; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: 7456; GFX10-WGP: ; %bb.0: ; %entry 7457; GFX10-WGP-NEXT: s_clause 0x1 7458; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7459; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7460; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7461; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 7462; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 7463; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7464; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7465; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7466; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7467; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 7468; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7469; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7470; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7471; GFX10-WGP-NEXT: buffer_gl0_inv 7472; GFX10-WGP-NEXT: buffer_gl1_inv 7473; GFX10-WGP-NEXT: s_endpgm 7474; 
7475; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: 7476; GFX10-CU: ; %bb.0: ; %entry 7477; GFX10-CU-NEXT: s_clause 0x1 7478; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7479; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7480; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7481; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 7482; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 7483; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7484; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7485; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7486; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7487; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7488; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7489; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7490; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7491; GFX10-CU-NEXT: buffer_gl0_inv 7492; GFX10-CU-NEXT: buffer_gl1_inv 7493; GFX10-CU-NEXT: s_endpgm 7494; 7495; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: 7496; SKIP-CACHE-INV: ; %bb.0: ; %entry 7497; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7498; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7499; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7500; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 7501; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 7502; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7503; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7504; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7505; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7506; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7507; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7508; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7509; SKIP-CACHE-INV-NEXT: s_endpgm 7510; 7511; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: 7512; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7513; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7514; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7515; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7516; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7517; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7518; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 7519; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7520; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7521; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7522; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 7523; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 7524; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7525; 7526; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: 7527; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7528; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7529; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7530; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7531; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7532; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7533; GFX90A-TGSPLIT-NEXT: buffer_wbl2 7534; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7535; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7536; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7537; GFX90A-TGSPLIT-NEXT: buffer_invl2 7538; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7539; GFX90A-TGSPLIT-NEXT: s_endpgm 7540 i32* %out, i32 %in, i32 %old) { 7541entry: 7542 %gep = getelementptr i32, i32* %out, i32 4 7543 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst 7544 ret void 7545} 7546 7547define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( 7548; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: 7549; GFX7: ; %bb.0: ; %entry 7550; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7551; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7552; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7553; GFX7-NEXT: s_add_u32 s0, s0, 16 7554; GFX7-NEXT: s_addc_u32 s1, s1, 0 7555; GFX7-NEXT: v_mov_b32_e32 v0, s0 7556; GFX7-NEXT: v_mov_b32_e32 v2, s2 7557; GFX7-NEXT: v_mov_b32_e32 v1, s1 7558; GFX7-NEXT: v_mov_b32_e32 v3, s3 7559; GFX7-NEXT: s_waitcnt vmcnt(0) 7560; GFX7-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 7561; GFX7-NEXT: s_waitcnt vmcnt(0) 7562; GFX7-NEXT: buffer_wbinvl1_vol 7563; GFX7-NEXT: s_endpgm 7564; 7565; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: 7566; GFX10-WGP: ; %bb.0: ; %entry 7567; GFX10-WGP-NEXT: s_clause 0x1 7568; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7569; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7570; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7571; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 7572; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 7573; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7574; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7575; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7576; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7577; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 7578; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7579; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7580; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7581; GFX10-WGP-NEXT: buffer_gl0_inv 7582; GFX10-WGP-NEXT: buffer_gl1_inv 7583; GFX10-WGP-NEXT: s_endpgm 7584; 7585; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: 7586; GFX10-CU: ; %bb.0: ; %entry 7587; GFX10-CU-NEXT: s_clause 0x1 7588; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7589; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7590; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7591; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 7592; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 7593; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7594; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7595; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7596; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7597; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7598; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7599; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7600; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7601; GFX10-CU-NEXT: buffer_gl0_inv 7602; GFX10-CU-NEXT: buffer_gl1_inv 7603; GFX10-CU-NEXT: s_endpgm 7604; 7605; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: 7606; SKIP-CACHE-INV: ; %bb.0: ; %entry 7607; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x9 7608; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7609; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7610; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 7611; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 7612; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7613; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7614; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7615; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7616; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7617; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7618; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7619; SKIP-CACHE-INV-NEXT: s_endpgm 7620; 7621; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: 7622; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7623; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7624; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7625; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7626; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7627; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7628; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 7629; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7630; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7631; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7632; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 7633; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 7634; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7635; 7636; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: 7637; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7638; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7639; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7640; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7641; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7642; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7643; GFX90A-TGSPLIT-NEXT: buffer_wbl2 7644; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7645; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 7646; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7647; GFX90A-TGSPLIT-NEXT: buffer_invl2 7648; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7649; GFX90A-TGSPLIT-NEXT: s_endpgm 7650 i32* %out, i32 %in, i32 %old) { 7651entry: 7652 %gep = getelementptr i32, i32* %out, i32 4 7653 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst 7654 ret void 7655} 7656 7657define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( 7658; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: 7659; GFX7: ; %bb.0: ; %entry 7660; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7661; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7662; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7663; GFX7-NEXT: s_add_u32 s0, s0, 16 7664; GFX7-NEXT: s_addc_u32 s1, s1, 0 7665; GFX7-NEXT: v_mov_b32_e32 v0, s0 7666; GFX7-NEXT: v_mov_b32_e32 v2, s2 7667; GFX7-NEXT: v_mov_b32_e32 v1, s1 7668; GFX7-NEXT: v_mov_b32_e32 v3, s3 7669; GFX7-NEXT: s_waitcnt vmcnt(0) 7670; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7671; GFX7-NEXT: s_waitcnt vmcnt(0) 7672; GFX7-NEXT: buffer_wbinvl1_vol 7673; GFX7-NEXT: s_endpgm 7674; 7675; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: 7676; GFX10-WGP: ; %bb.0: ; %entry 7677; GFX10-WGP-NEXT: s_clause 0x1 7678; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7679; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7680; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7681; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 7682; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 7683; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7684; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7685; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7686; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7687; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 7688; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7689; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7690; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7691; GFX10-WGP-NEXT: buffer_gl0_inv 7692; GFX10-WGP-NEXT: buffer_gl1_inv 7693; GFX10-WGP-NEXT: s_endpgm 7694; 7695; 
GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: 7696; GFX10-CU: ; %bb.0: ; %entry 7697; GFX10-CU-NEXT: s_clause 0x1 7698; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7699; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7700; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7701; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 7702; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 7703; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7704; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7705; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7706; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7707; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7708; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7709; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7710; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7711; GFX10-CU-NEXT: buffer_gl0_inv 7712; GFX10-CU-NEXT: buffer_gl1_inv 7713; GFX10-CU-NEXT: s_endpgm 7714; 7715; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: 7716; SKIP-CACHE-INV: ; %bb.0: ; %entry 7717; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7718; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7719; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7720; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 7721; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 7722; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7723; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7724; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7725; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7726; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7727; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7728; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7729; SKIP-CACHE-INV-NEXT: s_endpgm 7730; 7731; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: 7732; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7733; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7734; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7735; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7736; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7737; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7738; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 7739; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7740; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7741; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7742; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 7743; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 7744; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7745; 7746; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: 7747; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7748; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7749; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7750; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7751; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7752; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7753; GFX90A-TGSPLIT-NEXT: buffer_wbl2 7754; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7755; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7756; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7757; GFX90A-TGSPLIT-NEXT: buffer_invl2 7758; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7759; GFX90A-TGSPLIT-NEXT: s_endpgm 7760 i32* %out, i32 %in, i32 %old) { 7761entry: 7762 %gep = getelementptr i32, i32* %out, i32 4 7763 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst 7764 ret void 7765} 7766 7767define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( 7768; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: 7769; GFX7: ; %bb.0: ; %entry 7770; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7771; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7772; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7773; GFX7-NEXT: s_add_u32 s0, s0, 16 7774; GFX7-NEXT: s_addc_u32 s1, s1, 0 7775; GFX7-NEXT: v_mov_b32_e32 v0, s0 7776; GFX7-NEXT: v_mov_b32_e32 v2, s2 7777; GFX7-NEXT: v_mov_b32_e32 v1, s1 7778; GFX7-NEXT: v_mov_b32_e32 v3, s3 7779; GFX7-NEXT: s_waitcnt vmcnt(0) 7780; GFX7-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 7781; GFX7-NEXT: s_waitcnt vmcnt(0) 7782; GFX7-NEXT: buffer_wbinvl1_vol 7783; GFX7-NEXT: s_endpgm 7784; 7785; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: 7786; GFX10-WGP: ; %bb.0: ; %entry 7787; GFX10-WGP-NEXT: s_clause 0x1 7788; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7789; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7790; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7791; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 7792; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 7793; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7794; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7795; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7796; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7797; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 7798; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7799; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7800; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7801; GFX10-WGP-NEXT: buffer_gl0_inv 7802; GFX10-WGP-NEXT: buffer_gl1_inv 7803; GFX10-WGP-NEXT: s_endpgm 7804; 7805; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: 7806; GFX10-CU: ; %bb.0: ; %entry 7807; GFX10-CU-NEXT: s_clause 0x1 7808; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7809; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7810; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7811; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 7812; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 7813; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7814; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7815; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7816; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7817; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7818; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7819; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7820; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 7821; GFX10-CU-NEXT: buffer_gl0_inv 7822; GFX10-CU-NEXT: buffer_gl1_inv 7823; GFX10-CU-NEXT: s_endpgm 7824; 7825; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: 7826; SKIP-CACHE-INV: ; %bb.0: ; %entry 7827; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x9 7828; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7829; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7830; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 7831; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 7832; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7833; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7834; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7835; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7836; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7837; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 7838; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7839; SKIP-CACHE-INV-NEXT: s_endpgm 7840; 7841; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: 7842; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7843; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7844; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7845; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7846; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7847; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7848; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 7849; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7850; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 7851; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7852; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 7853; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 7854; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7855; 7856; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: 7857; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7858; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7859; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7860; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7861; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7862; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7863; GFX90A-TGSPLIT-NEXT: buffer_wbl2 7864; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7865; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 7866; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7867; GFX90A-TGSPLIT-NEXT: buffer_invl2 7868; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7869; GFX90A-TGSPLIT-NEXT: s_endpgm 7870 i32* %out, i32 %in, i32 %old) { 7871entry: 7872 %gep = getelementptr i32, i32* %out, i32 4 7873 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst 7874 ret void 7875} 7876 7877define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( 7878; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: 7879; GFX7: ; %bb.0: ; %entry 7880; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7881; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7882; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7883; GFX7-NEXT: s_add_u32 s4, s0, 16 7884; GFX7-NEXT: s_addc_u32 s5, s1, 0 7885; GFX7-NEXT: v_mov_b32_e32 v0, s4 7886; GFX7-NEXT: v_mov_b32_e32 v2, s2 7887; GFX7-NEXT: v_mov_b32_e32 v1, s5 7888; GFX7-NEXT: v_mov_b32_e32 v3, s3 7889; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7890; GFX7-NEXT: v_mov_b32_e32 v0, s0 7891; GFX7-NEXT: v_mov_b32_e32 v1, s1 7892; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7893; GFX7-NEXT: flat_store_dword v[0:1], v2 7894; GFX7-NEXT: s_endpgm 7895; 7896; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: 7897; GFX10-WGP: ; %bb.0: ; %entry 7898; GFX10-WGP-NEXT: s_clause 0x1 7899; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7900; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7901; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7902; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7903; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7904; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7905; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7906; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7907; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7908; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7909; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7910; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7911; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7912; GFX10-WGP-NEXT: 
flat_store_dword v[0:1], v2 7913; GFX10-WGP-NEXT: s_endpgm 7914; 7915; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: 7916; GFX10-CU: ; %bb.0: ; %entry 7917; GFX10-CU-NEXT: s_clause 0x1 7918; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7919; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7920; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7921; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7922; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7923; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7924; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7925; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7926; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7927; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7928; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7929; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7930; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7931; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7932; GFX10-CU-NEXT: s_endpgm 7933; 7934; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: 7935; SKIP-CACHE-INV: ; %bb.0: ; %entry 7936; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7937; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7938; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7939; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7940; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7941; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7942; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7943; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7944; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7945; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7946; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7947; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7948; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7949; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7950; SKIP-CACHE-INV-NEXT: s_endpgm 7951; 7952; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: 7953; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7954; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7955; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7956; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7957; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7958; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7959; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7960; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7961; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7962; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7963; 7964; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: 7965; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7966; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7967; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7968; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7969; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7970; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7971; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7972; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7973; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7974; GFX90A-TGSPLIT-NEXT: s_endpgm 7975 i32* %out, i32 %in, i32 %old) { 7976entry: 7977 %gep = getelementptr i32, i32* %out, i32 4 7978 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic 7979 %val0 = extractvalue { i32, i1 } %val, 0 7980 store i32 %val0, i32* %out, align 4 7981 ret void 7982} 7983 7984define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( 7985; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: 7986; GFX7: ; %bb.0: ; %entry 7987; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7988; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7989; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7990; GFX7-NEXT: s_add_u32 s4, s0, 16 7991; GFX7-NEXT: s_addc_u32 s5, s1, 0 7992; GFX7-NEXT: v_mov_b32_e32 v0, s4 7993; GFX7-NEXT: v_mov_b32_e32 v2, s2 7994; GFX7-NEXT: v_mov_b32_e32 v1, s5 7995; 
GFX7-NEXT: v_mov_b32_e32 v3, s3 7996; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7997; GFX7-NEXT: s_waitcnt vmcnt(0) 7998; GFX7-NEXT: buffer_wbinvl1_vol 7999; GFX7-NEXT: v_mov_b32_e32 v0, s0 8000; GFX7-NEXT: v_mov_b32_e32 v1, s1 8001; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8002; GFX7-NEXT: flat_store_dword v[0:1], v2 8003; GFX7-NEXT: s_endpgm 8004; 8005; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: 8006; GFX10-WGP: ; %bb.0: ; %entry 8007; GFX10-WGP-NEXT: s_clause 0x1 8008; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8009; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8010; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8011; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 8012; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 8013; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8014; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8015; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8016; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8017; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8018; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8019; GFX10-WGP-NEXT: buffer_gl0_inv 8020; GFX10-WGP-NEXT: buffer_gl1_inv 8021; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8022; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8023; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8024; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8025; GFX10-WGP-NEXT: s_endpgm 8026; 8027; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: 8028; GFX10-CU: ; %bb.0: ; %entry 8029; GFX10-CU-NEXT: s_clause 0x1 8030; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8031; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8032; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8033; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 8034; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 8035; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8036; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8037; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8038; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8039; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8040; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8041; GFX10-CU-NEXT: buffer_gl0_inv 
8042; GFX10-CU-NEXT: buffer_gl1_inv 8043; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8044; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8045; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8046; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8047; GFX10-CU-NEXT: s_endpgm 8048; 8049; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: 8050; SKIP-CACHE-INV: ; %bb.0: ; %entry 8051; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 8052; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8053; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8054; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 8055; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 8056; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8057; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8058; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 8059; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8060; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8061; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8062; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8063; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8064; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8065; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8066; SKIP-CACHE-INV-NEXT: s_endpgm 8067; 8068; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: 8069; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8070; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8071; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8072; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8073; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8074; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8075; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8076; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8077; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 8078; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 8079; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8080; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8081; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8082; 8083; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: 8084; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8085; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8086; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8087; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8088; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8089; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8090; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8091; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8092; GFX90A-TGSPLIT-NEXT: buffer_invl2 8093; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8094; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8095; GFX90A-TGSPLIT-NEXT: s_endpgm 8096 i32* %out, i32 %in, i32 %old) { 8097entry: 8098 %gep = getelementptr i32, i32* %out, i32 4 8099 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic 8100 %val0 = extractvalue { i32, i1 } %val, 0 8101 store i32 %val0, i32* %out, align 4 8102 ret void 8103} 8104 8105define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( 8106; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: 8107; GFX7: ; %bb.0: ; %entry 8108; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8109; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8110; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8111; GFX7-NEXT: s_add_u32 s4, s0, 16 8112; GFX7-NEXT: s_addc_u32 s5, s1, 0 8113; GFX7-NEXT: v_mov_b32_e32 v0, s4 8114; GFX7-NEXT: v_mov_b32_e32 v2, s2 8115; GFX7-NEXT: v_mov_b32_e32 v1, s5 8116; GFX7-NEXT: v_mov_b32_e32 v3, s3 8117; GFX7-NEXT: s_waitcnt vmcnt(0) 8118; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8119; GFX7-NEXT: v_mov_b32_e32 v0, s0 8120; GFX7-NEXT: v_mov_b32_e32 v1, s1 8121; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8122; GFX7-NEXT: flat_store_dword v[0:1], v2 8123; GFX7-NEXT: s_endpgm 8124; 8125; GFX10-WGP-LABEL: 
flat_system_one_as_release_monotonic_ret_cmpxchg: 8126; GFX10-WGP: ; %bb.0: ; %entry 8127; GFX10-WGP-NEXT: s_clause 0x1 8128; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8129; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8130; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8131; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 8132; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 8133; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8134; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8135; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8136; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8137; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8138; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8139; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8140; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8141; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8142; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8143; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8144; GFX10-WGP-NEXT: s_endpgm 8145; 8146; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: 8147; GFX10-CU: ; %bb.0: ; %entry 8148; GFX10-CU-NEXT: s_clause 0x1 8149; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8150; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8151; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8152; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 8153; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 8154; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8155; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8156; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8157; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8158; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8159; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 8160; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8161; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8162; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8163; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8164; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8165; GFX10-CU-NEXT: s_endpgm 8166; 8167; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: 8168; SKIP-CACHE-INV: ; %bb.0: ; %entry 8169; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x9 8170; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8171; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8172; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 8173; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 8174; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8175; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8176; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 8177; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8178; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8179; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8180; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8181; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8182; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8183; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8184; SKIP-CACHE-INV-NEXT: s_endpgm 8185; 8186; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: 8187; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8188; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8189; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8190; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8191; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8192; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8193; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 8194; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8195; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8196; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8197; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8198; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8199; 8200; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: 8201; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8202; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8203; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8204; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8205; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8206; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], 
s[2:3] op_sel:[0,1] 8207; GFX90A-TGSPLIT-NEXT: buffer_wbl2 8208; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8209; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8210; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8211; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8212; GFX90A-TGSPLIT-NEXT: s_endpgm 8213 i32* %out, i32 %in, i32 %old) { 8214entry: 8215 %gep = getelementptr i32, i32* %out, i32 4 8216 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic 8217 %val0 = extractvalue { i32, i1 } %val, 0 8218 store i32 %val0, i32* %out, align 4 8219 ret void 8220} 8221 8222define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( 8223; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: 8224; GFX7: ; %bb.0: ; %entry 8225; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8226; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8227; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8228; GFX7-NEXT: s_add_u32 s4, s0, 16 8229; GFX7-NEXT: s_addc_u32 s5, s1, 0 8230; GFX7-NEXT: v_mov_b32_e32 v0, s4 8231; GFX7-NEXT: v_mov_b32_e32 v2, s2 8232; GFX7-NEXT: v_mov_b32_e32 v1, s5 8233; GFX7-NEXT: v_mov_b32_e32 v3, s3 8234; GFX7-NEXT: s_waitcnt vmcnt(0) 8235; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8236; GFX7-NEXT: s_waitcnt vmcnt(0) 8237; GFX7-NEXT: buffer_wbinvl1_vol 8238; GFX7-NEXT: v_mov_b32_e32 v0, s0 8239; GFX7-NEXT: v_mov_b32_e32 v1, s1 8240; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8241; GFX7-NEXT: flat_store_dword v[0:1], v2 8242; GFX7-NEXT: s_endpgm 8243; 8244; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: 8245; GFX10-WGP: ; %bb.0: ; %entry 8246; GFX10-WGP-NEXT: s_clause 0x1 8247; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8248; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8249; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8250; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 8251; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 8252; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8253; GFX10-WGP-NEXT: v_mov_b32_e32 
v2, s2 8254; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8255; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8256; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8257; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8258; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8259; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8260; GFX10-WGP-NEXT: buffer_gl0_inv 8261; GFX10-WGP-NEXT: buffer_gl1_inv 8262; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8263; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8264; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8265; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8266; GFX10-WGP-NEXT: s_endpgm 8267; 8268; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: 8269; GFX10-CU: ; %bb.0: ; %entry 8270; GFX10-CU-NEXT: s_clause 0x1 8271; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8272; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8273; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8274; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 8275; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 8276; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8277; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8278; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8279; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8280; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8281; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 8282; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8283; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8284; GFX10-CU-NEXT: buffer_gl0_inv 8285; GFX10-CU-NEXT: buffer_gl1_inv 8286; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8287; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8288; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8289; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8290; GFX10-CU-NEXT: s_endpgm 8291; 8292; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: 8293; SKIP-CACHE-INV: ; %bb.0: ; %entry 8294; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 8295; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8296; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8297; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 8298; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 8299; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8300; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8301; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 8302; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8303; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8304; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8305; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8306; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8307; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8308; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8309; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8310; SKIP-CACHE-INV-NEXT: s_endpgm 8311; 8312; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: 8313; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8314; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8315; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8316; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8317; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8318; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8319; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 8320; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8321; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8322; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8323; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 8324; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 8325; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8326; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8327; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8328; 8329; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: 8330; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8331; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8332; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8333; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8334; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8335; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8336; GFX90A-TGSPLIT-NEXT: 
buffer_wbl2 8337; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8338; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8339; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8340; GFX90A-TGSPLIT-NEXT: buffer_invl2 8341; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8342; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8343; GFX90A-TGSPLIT-NEXT: s_endpgm 8344 i32* %out, i32 %in, i32 %old) { 8345entry: 8346 %gep = getelementptr i32, i32* %out, i32 4 8347 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic 8348 %val0 = extractvalue { i32, i1 } %val, 0 8349 store i32 %val0, i32* %out, align 4 8350 ret void 8351} 8352 8353define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( 8354; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: 8355; GFX7: ; %bb.0: ; %entry 8356; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8357; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8358; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8359; GFX7-NEXT: s_add_u32 s4, s0, 16 8360; GFX7-NEXT: s_addc_u32 s5, s1, 0 8361; GFX7-NEXT: v_mov_b32_e32 v0, s4 8362; GFX7-NEXT: v_mov_b32_e32 v2, s2 8363; GFX7-NEXT: v_mov_b32_e32 v1, s5 8364; GFX7-NEXT: v_mov_b32_e32 v3, s3 8365; GFX7-NEXT: s_waitcnt vmcnt(0) 8366; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8367; GFX7-NEXT: s_waitcnt vmcnt(0) 8368; GFX7-NEXT: buffer_wbinvl1_vol 8369; GFX7-NEXT: v_mov_b32_e32 v0, s0 8370; GFX7-NEXT: v_mov_b32_e32 v1, s1 8371; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8372; GFX7-NEXT: flat_store_dword v[0:1], v2 8373; GFX7-NEXT: s_endpgm 8374; 8375; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: 8376; GFX10-WGP: ; %bb.0: ; %entry 8377; GFX10-WGP-NEXT: s_clause 0x1 8378; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8379; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8380; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8381; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 8382; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 8383; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 
s4 8384; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8385; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8386; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8387; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8388; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8389; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8390; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8391; GFX10-WGP-NEXT: buffer_gl0_inv 8392; GFX10-WGP-NEXT: buffer_gl1_inv 8393; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8394; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8395; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8396; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8397; GFX10-WGP-NEXT: s_endpgm 8398; 8399; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: 8400; GFX10-CU: ; %bb.0: ; %entry 8401; GFX10-CU-NEXT: s_clause 0x1 8402; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8403; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8404; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8405; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 8406; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 8407; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8408; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8409; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8410; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8411; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8412; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 8413; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8414; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8415; GFX10-CU-NEXT: buffer_gl0_inv 8416; GFX10-CU-NEXT: buffer_gl1_inv 8417; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8418; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8419; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8420; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8421; GFX10-CU-NEXT: s_endpgm 8422; 8423; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: 8424; SKIP-CACHE-INV: ; %bb.0: ; %entry 8425; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 8426; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8427; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8428; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 8429; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 8430; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8431; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8432; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 8433; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8434; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8435; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8436; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8437; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8438; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8439; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8440; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8441; SKIP-CACHE-INV-NEXT: s_endpgm 8442; 8443; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: 8444; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8445; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8446; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8447; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8448; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8449; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8450; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 8451; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8452; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8453; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8454; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 8455; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 8456; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8457; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8458; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8459; 8460; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: 8461; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8462; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8463; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8464; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8465; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8466; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] 8467; GFX90A-TGSPLIT-NEXT: buffer_wbl2 8468; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8469; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8470; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8471; GFX90A-TGSPLIT-NEXT: buffer_invl2 8472; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8473; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8474; GFX90A-TGSPLIT-NEXT: s_endpgm 8475 i32* %out, i32 %in, i32 %old) { 8476entry: 8477 %gep = getelementptr i32, i32* %out, i32 4 8478 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic 8479 %val0 = extractvalue { i32, i1 } %val, 0 8480 store i32 %val0, i32* %out, align 4 8481 ret void 8482} 8483 8484define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( 8485; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: 8486; GFX7: ; %bb.0: ; %entry 8487; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8488; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8489; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8490; GFX7-NEXT: s_add_u32 s4, s0, 16 8491; GFX7-NEXT: s_addc_u32 s5, s1, 0 8492; GFX7-NEXT: v_mov_b32_e32 v0, s4 8493; GFX7-NEXT: v_mov_b32_e32 v2, s2 8494; GFX7-NEXT: v_mov_b32_e32 v1, s5 8495; GFX7-NEXT: v_mov_b32_e32 v3, s3 8496; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8497; GFX7-NEXT: s_waitcnt vmcnt(0) 8498; GFX7-NEXT: buffer_wbinvl1_vol 8499; GFX7-NEXT: v_mov_b32_e32 v0, s0 8500; GFX7-NEXT: v_mov_b32_e32 v1, s1 8501; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8502; GFX7-NEXT: flat_store_dword v[0:1], v2 8503; GFX7-NEXT: s_endpgm 8504; 8505; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: 8506; GFX10-WGP: ; %bb.0: ; %entry 8507; GFX10-WGP-NEXT: s_clause 0x1 8508; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8509; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8510; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8511; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 8512; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 8513; GFX10-WGP-NEXT: v_mov_b32_e32 
v0, s4 8514; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8515; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8516; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8517; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8518; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8519; GFX10-WGP-NEXT: buffer_gl0_inv 8520; GFX10-WGP-NEXT: buffer_gl1_inv 8521; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8522; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8523; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8524; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8525; GFX10-WGP-NEXT: s_endpgm 8526; 8527; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: 8528; GFX10-CU: ; %bb.0: ; %entry 8529; GFX10-CU-NEXT: s_clause 0x1 8530; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8531; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8532; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8533; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 8534; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 8535; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8536; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8537; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8538; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8539; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8540; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8541; GFX10-CU-NEXT: buffer_gl0_inv 8542; GFX10-CU-NEXT: buffer_gl1_inv 8543; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8544; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8545; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8546; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8547; GFX10-CU-NEXT: s_endpgm 8548; 8549; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: 8550; SKIP-CACHE-INV: ; %bb.0: ; %entry 8551; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 8552; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8553; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8554; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 8555; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 8556; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8557; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8558; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 
8559; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8560; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8561; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8562; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8563; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8564; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8565; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8566; SKIP-CACHE-INV-NEXT: s_endpgm 8567; 8568; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: 8569; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8570; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8571; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8572; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8573; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8574; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8575; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8576; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8577; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 8578; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 8579; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8580; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8581; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8582; 8583; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: 8584; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8585; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8586; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8587; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8588; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8589; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8590; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8591; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8592; GFX90A-TGSPLIT-NEXT: buffer_invl2 8593; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8594; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8595; GFX90A-TGSPLIT-NEXT: s_endpgm 8596 
i32* %out, i32 %in, i32 %old) { 8597entry: 8598 %gep = getelementptr i32, i32* %out, i32 4 8599 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire 8600 %val0 = extractvalue { i32, i1 } %val, 0 8601 store i32 %val0, i32* %out, align 4 8602 ret void 8603} 8604 8605define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( 8606; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: 8607; GFX7: ; %bb.0: ; %entry 8608; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8609; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8610; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8611; GFX7-NEXT: s_add_u32 s4, s0, 16 8612; GFX7-NEXT: s_addc_u32 s5, s1, 0 8613; GFX7-NEXT: v_mov_b32_e32 v0, s4 8614; GFX7-NEXT: v_mov_b32_e32 v2, s2 8615; GFX7-NEXT: v_mov_b32_e32 v1, s5 8616; GFX7-NEXT: v_mov_b32_e32 v3, s3 8617; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8618; GFX7-NEXT: s_waitcnt vmcnt(0) 8619; GFX7-NEXT: buffer_wbinvl1_vol 8620; GFX7-NEXT: v_mov_b32_e32 v0, s0 8621; GFX7-NEXT: v_mov_b32_e32 v1, s1 8622; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8623; GFX7-NEXT: flat_store_dword v[0:1], v2 8624; GFX7-NEXT: s_endpgm 8625; 8626; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: 8627; GFX10-WGP: ; %bb.0: ; %entry 8628; GFX10-WGP-NEXT: s_clause 0x1 8629; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8630; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8631; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8632; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 8633; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 8634; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8635; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8636; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8637; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8638; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8639; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8640; GFX10-WGP-NEXT: buffer_gl0_inv 8641; GFX10-WGP-NEXT: buffer_gl1_inv 8642; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8643; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8644; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8645; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8646; GFX10-WGP-NEXT: s_endpgm 8647; 8648; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: 8649; GFX10-CU: ; %bb.0: ; %entry 8650; GFX10-CU-NEXT: s_clause 0x1 8651; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8652; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8653; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8654; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 8655; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 8656; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8657; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8658; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8659; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8660; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8661; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8662; GFX10-CU-NEXT: buffer_gl0_inv 8663; GFX10-CU-NEXT: buffer_gl1_inv 8664; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8665; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8666; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8667; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8668; GFX10-CU-NEXT: s_endpgm 8669; 8670; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: 8671; SKIP-CACHE-INV: ; %bb.0: ; %entry 8672; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 8673; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8674; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8675; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 8676; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 8677; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8678; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8679; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 8680; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8681; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8682; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8683; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8684; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8685; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8686; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8687; SKIP-CACHE-INV-NEXT: s_endpgm 8688; 8689; 
GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: 8690; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8691; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8692; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8693; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8694; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8695; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8696; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8697; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8698; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 8699; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 8700; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8701; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8702; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8703; 8704; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: 8705; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8706; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8707; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8708; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8709; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8710; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8711; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8712; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8713; GFX90A-TGSPLIT-NEXT: buffer_invl2 8714; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8715; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8716; GFX90A-TGSPLIT-NEXT: s_endpgm 8717 i32* %out, i32 %in, i32 %old) { 8718entry: 8719 %gep = getelementptr i32, i32* %out, i32 4 8720 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire 8721 %val0 = extractvalue { i32, i1 } %val, 0 8722 store i32 %val0, i32* %out, align 4 8723 ret void 8724} 8725 8726define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( 8727; GFX7-LABEL: 
flat_system_one_as_release_acquire_ret_cmpxchg: 8728; GFX7: ; %bb.0: ; %entry 8729; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8730; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8731; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8732; GFX7-NEXT: s_add_u32 s4, s0, 16 8733; GFX7-NEXT: s_addc_u32 s5, s1, 0 8734; GFX7-NEXT: v_mov_b32_e32 v0, s4 8735; GFX7-NEXT: v_mov_b32_e32 v2, s2 8736; GFX7-NEXT: v_mov_b32_e32 v1, s5 8737; GFX7-NEXT: v_mov_b32_e32 v3, s3 8738; GFX7-NEXT: s_waitcnt vmcnt(0) 8739; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8740; GFX7-NEXT: s_waitcnt vmcnt(0) 8741; GFX7-NEXT: buffer_wbinvl1_vol 8742; GFX7-NEXT: v_mov_b32_e32 v0, s0 8743; GFX7-NEXT: v_mov_b32_e32 v1, s1 8744; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8745; GFX7-NEXT: flat_store_dword v[0:1], v2 8746; GFX7-NEXT: s_endpgm 8747; 8748; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: 8749; GFX10-WGP: ; %bb.0: ; %entry 8750; GFX10-WGP-NEXT: s_clause 0x1 8751; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8752; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8753; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8754; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 8755; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 8756; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8757; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8758; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8759; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8760; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8761; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8762; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8763; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8764; GFX10-WGP-NEXT: buffer_gl0_inv 8765; GFX10-WGP-NEXT: buffer_gl1_inv 8766; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8767; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8768; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8769; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8770; GFX10-WGP-NEXT: s_endpgm 8771; 8772; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: 8773; GFX10-CU: ; %bb.0: ; %entry 8774; GFX10-CU-NEXT: s_clause 0x1 8775; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8776; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8777; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8778; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 8779; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 8780; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8781; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8782; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8783; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8784; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8785; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 8786; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8787; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8788; GFX10-CU-NEXT: buffer_gl0_inv 8789; GFX10-CU-NEXT: buffer_gl1_inv 8790; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8791; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8792; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8793; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8794; GFX10-CU-NEXT: s_endpgm 8795; 8796; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: 8797; SKIP-CACHE-INV: ; %bb.0: ; %entry 8798; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 8799; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8800; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8801; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 8802; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 8803; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8804; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8805; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 8806; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8807; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8808; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8809; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8810; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8811; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8812; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8813; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8814; SKIP-CACHE-INV-NEXT: s_endpgm 8815; 8816; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: 8817; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8818; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8819; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8820; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8821; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8822; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8823; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 8824; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8825; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8826; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8827; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 8828; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 8829; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8830; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8831; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8832; 8833; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: 8834; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8835; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8836; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8837; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8838; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8839; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8840; GFX90A-TGSPLIT-NEXT: buffer_wbl2 8841; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8842; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8843; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8844; GFX90A-TGSPLIT-NEXT: buffer_invl2 8845; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8846; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8847; GFX90A-TGSPLIT-NEXT: s_endpgm 8848 i32* %out, i32 %in, i32 %old) { 8849entry: 8850 %gep = getelementptr i32, i32* %out, i32 4 8851 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire 8852 %val0 = extractvalue { i32, i1 } %val, 0 8853 store i32 %val0, i32* %out, align 4 8854 ret void 8855} 8856 8857define amdgpu_kernel void 
@flat_system_one_as_acq_rel_acquire_ret_cmpxchg( 8858; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: 8859; GFX7: ; %bb.0: ; %entry 8860; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8861; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8862; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8863; GFX7-NEXT: s_add_u32 s4, s0, 16 8864; GFX7-NEXT: s_addc_u32 s5, s1, 0 8865; GFX7-NEXT: v_mov_b32_e32 v0, s4 8866; GFX7-NEXT: v_mov_b32_e32 v2, s2 8867; GFX7-NEXT: v_mov_b32_e32 v1, s5 8868; GFX7-NEXT: v_mov_b32_e32 v3, s3 8869; GFX7-NEXT: s_waitcnt vmcnt(0) 8870; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8871; GFX7-NEXT: s_waitcnt vmcnt(0) 8872; GFX7-NEXT: buffer_wbinvl1_vol 8873; GFX7-NEXT: v_mov_b32_e32 v0, s0 8874; GFX7-NEXT: v_mov_b32_e32 v1, s1 8875; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8876; GFX7-NEXT: flat_store_dword v[0:1], v2 8877; GFX7-NEXT: s_endpgm 8878; 8879; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: 8880; GFX10-WGP: ; %bb.0: ; %entry 8881; GFX10-WGP-NEXT: s_clause 0x1 8882; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8883; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8884; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8885; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 8886; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 8887; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8888; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 8889; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8890; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 8891; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8892; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8893; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8894; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 8895; GFX10-WGP-NEXT: buffer_gl0_inv 8896; GFX10-WGP-NEXT: buffer_gl1_inv 8897; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 8898; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 8899; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8900; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8901; GFX10-WGP-NEXT: s_endpgm 8902; 8903; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: 8904; 
GFX10-CU: ; %bb.0: ; %entry 8905; GFX10-CU-NEXT: s_clause 0x1 8906; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8907; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8908; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8909; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 8910; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 8911; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8912; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 8913; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8914; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 8915; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8916; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 8917; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8918; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8919; GFX10-CU-NEXT: buffer_gl0_inv 8920; GFX10-CU-NEXT: buffer_gl1_inv 8921; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 8922; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 8923; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8924; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8925; GFX10-CU-NEXT: s_endpgm 8926; 8927; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: 8928; SKIP-CACHE-INV: ; %bb.0: ; %entry 8929; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 8930; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8931; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8932; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 8933; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 8934; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8935; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8936; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 8937; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8938; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8939; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8940; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8941; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8942; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8943; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8944; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8945; SKIP-CACHE-INV-NEXT: s_endpgm 8946; 8947; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: 
8948; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8949; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8950; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8951; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8952; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8953; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8954; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 8955; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8956; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8957; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8958; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 8959; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 8960; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8961; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8962; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8963; 8964; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: 8965; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8966; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8967; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8968; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8969; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 8970; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 8971; GFX90A-TGSPLIT-NEXT: buffer_wbl2 8972; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8973; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8974; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8975; GFX90A-TGSPLIT-NEXT: buffer_invl2 8976; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8977; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8978; GFX90A-TGSPLIT-NEXT: s_endpgm 8979 i32* %out, i32 %in, i32 %old) { 8980entry: 8981 %gep = getelementptr i32, i32* %out, i32 4 8982 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire 8983 %val0 = extractvalue { i32, i1 } %val, 0 8984 store i32 %val0, i32* %out, align 4 8985 ret void 8986} 8987 8988define 
amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( 8989; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: 8990; GFX7: ; %bb.0: ; %entry 8991; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8992; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 8993; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8994; GFX7-NEXT: s_add_u32 s4, s0, 16 8995; GFX7-NEXT: s_addc_u32 s5, s1, 0 8996; GFX7-NEXT: v_mov_b32_e32 v0, s4 8997; GFX7-NEXT: v_mov_b32_e32 v2, s2 8998; GFX7-NEXT: v_mov_b32_e32 v1, s5 8999; GFX7-NEXT: v_mov_b32_e32 v3, s3 9000; GFX7-NEXT: s_waitcnt vmcnt(0) 9001; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9002; GFX7-NEXT: s_waitcnt vmcnt(0) 9003; GFX7-NEXT: buffer_wbinvl1_vol 9004; GFX7-NEXT: v_mov_b32_e32 v0, s0 9005; GFX7-NEXT: v_mov_b32_e32 v1, s1 9006; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9007; GFX7-NEXT: flat_store_dword v[0:1], v2 9008; GFX7-NEXT: s_endpgm 9009; 9010; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: 9011; GFX10-WGP: ; %bb.0: ; %entry 9012; GFX10-WGP-NEXT: s_clause 0x1 9013; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9014; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9015; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9016; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 9017; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 9018; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9019; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9020; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9021; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9022; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 9023; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 9024; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9025; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 9026; GFX10-WGP-NEXT: buffer_gl0_inv 9027; GFX10-WGP-NEXT: buffer_gl1_inv 9028; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9029; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9030; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9031; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 9032; GFX10-WGP-NEXT: s_endpgm 9033; 9034; GFX10-CU-LABEL: 
flat_system_one_as_seq_cst_acquire_ret_cmpxchg: 9035; GFX10-CU: ; %bb.0: ; %entry 9036; GFX10-CU-NEXT: s_clause 0x1 9037; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9038; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9039; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9040; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 9041; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 9042; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9043; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9044; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9045; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9046; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9047; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 9048; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9049; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9050; GFX10-CU-NEXT: buffer_gl0_inv 9051; GFX10-CU-NEXT: buffer_gl1_inv 9052; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9053; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9054; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9055; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9056; GFX10-CU-NEXT: s_endpgm 9057; 9058; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: 9059; SKIP-CACHE-INV: ; %bb.0: ; %entry 9060; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 9061; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 9062; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9063; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 9064; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 9065; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9066; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9067; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 9068; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9069; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9070; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9071; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9072; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9073; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9074; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9075; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9076; SKIP-CACHE-INV-NEXT: s_endpgm 9077; 9078; 
GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: 9079; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9080; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9081; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9082; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9083; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9084; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9085; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 9086; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9087; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9088; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9089; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 9090; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 9091; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9092; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9093; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9094; 9095; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: 9096; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9097; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9098; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9099; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9100; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9101; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9102; GFX90A-TGSPLIT-NEXT: buffer_wbl2 9103; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9104; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9105; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9106; GFX90A-TGSPLIT-NEXT: buffer_invl2 9107; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 9108; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9109; GFX90A-TGSPLIT-NEXT: s_endpgm 9110 i32* %out, i32 %in, i32 %old) { 9111entry: 9112 %gep = getelementptr i32, i32* %out, i32 4 9113 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire 9114 %val0 = extractvalue { i32, i1 } %val, 0 9115 store 
i32 %val0, i32* %out, align 4 9116 ret void 9117} 9118 9119define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( 9120; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: 9121; GFX7: ; %bb.0: ; %entry 9122; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9123; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9124; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9125; GFX7-NEXT: s_add_u32 s4, s0, 16 9126; GFX7-NEXT: s_addc_u32 s5, s1, 0 9127; GFX7-NEXT: v_mov_b32_e32 v0, s4 9128; GFX7-NEXT: v_mov_b32_e32 v2, s2 9129; GFX7-NEXT: v_mov_b32_e32 v1, s5 9130; GFX7-NEXT: v_mov_b32_e32 v3, s3 9131; GFX7-NEXT: s_waitcnt vmcnt(0) 9132; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9133; GFX7-NEXT: s_waitcnt vmcnt(0) 9134; GFX7-NEXT: buffer_wbinvl1_vol 9135; GFX7-NEXT: v_mov_b32_e32 v0, s0 9136; GFX7-NEXT: v_mov_b32_e32 v1, s1 9137; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9138; GFX7-NEXT: flat_store_dword v[0:1], v2 9139; GFX7-NEXT: s_endpgm 9140; 9141; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: 9142; GFX10-WGP: ; %bb.0: ; %entry 9143; GFX10-WGP-NEXT: s_clause 0x1 9144; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9145; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9146; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9147; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 9148; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 9149; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9150; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9151; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9152; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9153; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 9154; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 9155; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9156; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 9157; GFX10-WGP-NEXT: buffer_gl0_inv 9158; GFX10-WGP-NEXT: buffer_gl1_inv 9159; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9160; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9161; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9162; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 9163; GFX10-WGP-NEXT: s_endpgm 
9164; 9165; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: 9166; GFX10-CU: ; %bb.0: ; %entry 9167; GFX10-CU-NEXT: s_clause 0x1 9168; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9169; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9170; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9171; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 9172; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 9173; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9174; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9175; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9176; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9177; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9178; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 9179; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9180; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9181; GFX10-CU-NEXT: buffer_gl0_inv 9182; GFX10-CU-NEXT: buffer_gl1_inv 9183; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9184; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9185; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9186; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9187; GFX10-CU-NEXT: s_endpgm 9188; 9189; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: 9190; SKIP-CACHE-INV: ; %bb.0: ; %entry 9191; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 9192; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 9193; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9194; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 9195; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 9196; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9197; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9198; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 9199; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9200; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9201; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9202; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9203; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9204; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9205; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9206; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9207; SKIP-CACHE-INV-NEXT: 
s_endpgm 9208; 9209; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: 9210; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9211; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9212; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9213; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9214; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9215; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9216; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 9217; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9218; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9219; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9220; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 9221; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 9222; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9223; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9224; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9225; 9226; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: 9227; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9228; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9229; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9230; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9231; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9232; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9233; GFX90A-TGSPLIT-NEXT: buffer_wbl2 9234; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9235; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9236; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9237; GFX90A-TGSPLIT-NEXT: buffer_invl2 9238; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 9239; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9240; GFX90A-TGSPLIT-NEXT: s_endpgm 9241 i32* %out, i32 %in, i32 %old) { 9242entry: 9243 %gep = getelementptr i32, i32* %out, i32 4 9244 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst 9245 %val0 = extractvalue { 
i32, i1 } %val, 0 9246 store i32 %val0, i32* %out, align 4 9247 ret void 9248} 9249 9250define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( 9251; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: 9252; GFX7: ; %bb.0: ; %entry 9253; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9254; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9255; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9256; GFX7-NEXT: s_add_u32 s4, s0, 16 9257; GFX7-NEXT: s_addc_u32 s5, s1, 0 9258; GFX7-NEXT: v_mov_b32_e32 v0, s4 9259; GFX7-NEXT: v_mov_b32_e32 v2, s2 9260; GFX7-NEXT: v_mov_b32_e32 v1, s5 9261; GFX7-NEXT: v_mov_b32_e32 v3, s3 9262; GFX7-NEXT: s_waitcnt vmcnt(0) 9263; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9264; GFX7-NEXT: s_waitcnt vmcnt(0) 9265; GFX7-NEXT: buffer_wbinvl1_vol 9266; GFX7-NEXT: v_mov_b32_e32 v0, s0 9267; GFX7-NEXT: v_mov_b32_e32 v1, s1 9268; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9269; GFX7-NEXT: flat_store_dword v[0:1], v2 9270; GFX7-NEXT: s_endpgm 9271; 9272; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: 9273; GFX10-WGP: ; %bb.0: ; %entry 9274; GFX10-WGP-NEXT: s_clause 0x1 9275; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9276; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9277; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9278; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 9279; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 9280; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9281; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9282; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9283; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9284; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 9285; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 9286; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9287; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 9288; GFX10-WGP-NEXT: buffer_gl0_inv 9289; GFX10-WGP-NEXT: buffer_gl1_inv 9290; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9291; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9292; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9293; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 9294; 
GFX10-WGP-NEXT: s_endpgm 9295; 9296; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: 9297; GFX10-CU: ; %bb.0: ; %entry 9298; GFX10-CU-NEXT: s_clause 0x1 9299; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9300; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9301; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9302; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 9303; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 9304; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9305; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9306; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9307; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9308; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9309; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 9310; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9311; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9312; GFX10-CU-NEXT: buffer_gl0_inv 9313; GFX10-CU-NEXT: buffer_gl1_inv 9314; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9315; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9316; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9317; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9318; GFX10-CU-NEXT: s_endpgm 9319; 9320; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: 9321; SKIP-CACHE-INV: ; %bb.0: ; %entry 9322; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 9323; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 9324; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9325; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 9326; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 9327; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9328; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9329; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 9330; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9331; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9332; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9333; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9334; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9335; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9336; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9337; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9338; 
SKIP-CACHE-INV-NEXT: s_endpgm 9339; 9340; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: 9341; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9342; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9343; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9344; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9345; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9346; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9347; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 9348; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9349; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9350; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9351; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 9352; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 9353; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9354; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9355; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9356; 9357; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: 9358; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9359; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9360; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9361; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9362; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9363; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9364; GFX90A-TGSPLIT-NEXT: buffer_wbl2 9365; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9366; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9367; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9368; GFX90A-TGSPLIT-NEXT: buffer_invl2 9369; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 9370; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9371; GFX90A-TGSPLIT-NEXT: s_endpgm 9372 i32* %out, i32 %in, i32 %old) { 9373entry: 9374 %gep = getelementptr i32, i32* %out, i32 4 9375 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst 9376 %val0 = 
extractvalue { i32, i1 } %val, 0 9377 store i32 %val0, i32* %out, align 4 9378 ret void 9379} 9380 9381define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( 9382; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: 9383; GFX7: ; %bb.0: ; %entry 9384; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9385; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 9386; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9387; GFX7-NEXT: s_add_u32 s4, s0, 16 9388; GFX7-NEXT: s_addc_u32 s5, s1, 0 9389; GFX7-NEXT: v_mov_b32_e32 v0, s4 9390; GFX7-NEXT: v_mov_b32_e32 v2, s2 9391; GFX7-NEXT: v_mov_b32_e32 v1, s5 9392; GFX7-NEXT: v_mov_b32_e32 v3, s3 9393; GFX7-NEXT: s_waitcnt vmcnt(0) 9394; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9395; GFX7-NEXT: s_waitcnt vmcnt(0) 9396; GFX7-NEXT: buffer_wbinvl1_vol 9397; GFX7-NEXT: v_mov_b32_e32 v0, s0 9398; GFX7-NEXT: v_mov_b32_e32 v1, s1 9399; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9400; GFX7-NEXT: flat_store_dword v[0:1], v2 9401; GFX7-NEXT: s_endpgm 9402; 9403; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: 9404; GFX10-WGP: ; %bb.0: ; %entry 9405; GFX10-WGP-NEXT: s_clause 0x1 9406; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9407; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9408; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9409; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 9410; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 9411; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9412; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 9413; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9414; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 9415; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 9416; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 9417; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9418; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 9419; GFX10-WGP-NEXT: buffer_gl0_inv 9420; GFX10-WGP-NEXT: buffer_gl1_inv 9421; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 9422; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 9423; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9424; GFX10-WGP-NEXT: flat_store_dword 
v[0:1], v2 9425; GFX10-WGP-NEXT: s_endpgm 9426; 9427; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: 9428; GFX10-CU: ; %bb.0: ; %entry 9429; GFX10-CU-NEXT: s_clause 0x1 9430; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9431; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9432; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9433; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 9434; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 9435; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9436; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 9437; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9438; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 9439; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9440; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 9441; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9442; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9443; GFX10-CU-NEXT: buffer_gl0_inv 9444; GFX10-CU-NEXT: buffer_gl1_inv 9445; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 9446; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 9447; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9448; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9449; GFX10-CU-NEXT: s_endpgm 9450; 9451; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: 9452; SKIP-CACHE-INV: ; %bb.0: ; %entry 9453; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 9454; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 9455; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9456; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 9457; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 9458; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9459; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 9460; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 9461; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 9462; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9463; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9464; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9465; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9466; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9467; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9468; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], 
v2 9469; SKIP-CACHE-INV-NEXT: s_endpgm 9470; 9471; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: 9472; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9473; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9474; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9475; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9476; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9477; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9478; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 9479; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9480; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9481; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9482; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 9483; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 9484; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9485; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9486; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9487; 9488; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: 9489; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9490; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9491; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 9492; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9493; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9494; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 9495; GFX90A-TGSPLIT-NEXT: buffer_wbl2 9496; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9497; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9498; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9499; GFX90A-TGSPLIT-NEXT: buffer_invl2 9500; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 9501; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9502; GFX90A-TGSPLIT-NEXT: s_endpgm 9503 i32* %out, i32 %in, i32 %old) { 9504entry: 9505 %gep = getelementptr i32, i32* %out, i32 4 9506 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst 9507 
%val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Value-returning cmpxchg through an i32* in the default address space, with
; syncscope("one-as"), success ordering acq_rel and failure ordering seq_cst.
; The exchange targets element 4 of %out (the checks show this either as a
; 16-byte scalar address add or as an offset:16 operand), compares against %old
; and swaps in %in; field 0 of the result pair is then stored back to %out.
;
; Every run expects "s_waitcnt vmcnt(0)" on both sides of the glc
; flat_atomic_cmpswap. The cache maintenance expected around it differs per run:
;   - amdhsa gfx700 run: buffer_wbinvl1_vol after the atomic;
;   - both gfx1010 runs: buffer_gl0_inv + buffer_gl1_inv after the atomic, plus
;     "s_waitcnt_vscnt null, 0x0" before it;
;   - both gfx90a runs: buffer_wbl2 before the atomic, buffer_invl2 +
;     buffer_wbinvl1_vol after it (only the run without +tgsplit also expects
;     "s_waitcnt lgkmcnt(0)" before the final store);
;   - the -amdgcn-skip-cache-invalidations run: no invalidate instructions.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script rather than editing by hand.
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Value-returning cmpxchg through an i32* in the default address space, with
; syncscope("one-as") and seq_cst for both the success and failure orderings.
; The exchange targets element 4 of %out (the checks show this either as a
; 16-byte scalar address add or as an offset:16 operand), compares against %old
; and swaps in %in; field 0 of the result pair is then stored back to %out.
;
; Every run expects "s_waitcnt vmcnt(0)" on both sides of the glc
; flat_atomic_cmpswap. The cache maintenance expected around it differs per run:
;   - amdhsa gfx700 run: buffer_wbinvl1_vol after the atomic;
;   - both gfx1010 runs: buffer_gl0_inv + buffer_gl1_inv after the atomic, plus
;     "s_waitcnt_vscnt null, 0x0" before it;
;   - both gfx90a runs: buffer_wbl2 before the atomic, buffer_invl2 +
;     buffer_wbinvl1_vol after it (only the run without +tgsplit also expects
;     "s_waitcnt lgkmcnt(0)" before the final store);
;   - the -amdgcn-skip-cache-invalidations run: no invalidate instructions.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script rather than editing by hand.
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
