; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s

; Flat-pointer atomic loads, stores and xchg read-modify-writes at
; syncscope("singlethread"), one kernel per memory ordering. In the
; expectations below each atomic is emitted as an ordinary flat_load_dword,
; flat_store_dword or flat_atomic_swap. The expectation blocks are generated;
; regenerate them with the script named on the first line instead of editing
; them by hand.

; Unordered atomic i32 load from %in; the loaded value is then stored to %out
; with a normal store. The +tgsplit configuration waits on vmcnt(0) only before
; the store; every other configuration waits on vmcnt(0) and lgkmcnt(0).
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
  store i32 %val, i32* %out
  ret void
}

; Monotonic atomic i32 load from %in, followed by a normal store of the value
; to %out. The expected instruction sequences match the unordered-load kernel.
define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX7-LABEL: flat_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
  store i32 %val, i32* %out
  ret void
}

; Acquire atomic i32 load from %in, followed by a normal store of the value to
; %out. The expected instruction sequences match the unordered-load kernel.
define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX7-LABEL: flat_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
  store i32 %val, i32* %out
  ret void
}

; Sequentially consistent atomic i32 load from %in, followed by a normal store
; of the value to %out. The expected instruction sequences match the
; unordered-load kernel.
define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX7-LABEL: flat_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
  store i32 %val, i32* %out
  ret void
}

; Unordered atomic i32 store of the kernel argument %in to %out. Each
; configuration expects a single flat_store_dword directly followed by
; s_endpgm.
define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX7-LABEL: flat_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
  ret void
}

; Monotonic atomic i32 store of %in to %out. The expected instruction
; sequences match the unordered-store kernel.
define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX7-LABEL: flat_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
  ret void
}

; Release atomic i32 store of %in to %out. The expected instruction sequences
; match the unordered-store kernel.
define amdgpu_kernel void @flat_singlethread_release_store(
; GFX7-LABEL: flat_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
  ret void
}

; Sequentially consistent atomic i32 store of %in to %out. The expected
; instruction sequences match the unordered-store kernel.
define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX7-LABEL: flat_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
  ret void
}

; Monotonic volatile atomicrmw xchg of %in into %out; the result %val is not
; used. Each configuration expects a single flat_atomic_swap directly followed
; by s_endpgm.
define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic
  ret void
}

; Acquire volatile atomicrmw xchg of %in into %out; the result %val is not
; used. The expected instruction sequences match the monotonic-atomicrmw
; kernel.
define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
  ret void
}

; Release volatile atomicrmw xchg of %in into %out; the result %val is not
; used. The expected instruction sequences match the monotonic-atomicrmw
; kernel.
define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release
  ret void
}

define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
;
GFX10-CU-NEXT: s_endpgm 888; 889; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw: 890; SKIP-CACHE-INV: ; %bb.0: ; %entry 891; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 892; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 893; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 894; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 895; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 896; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 897; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 898; SKIP-CACHE-INV-NEXT: s_endpgm 899; 900; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 901; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 902; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 903; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 904; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 905; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 906; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 907; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 908; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 909; 910; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 911; GFX90A-TGSPLIT: ; %bb.0: ; %entry 912; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 913; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 914; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 915; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 916; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 917; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 918; GFX90A-TGSPLIT-NEXT: s_endpgm 919 i32* %out, i32 %in) { 920entry: 921 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel 922 ret void 923} 924 925define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( 926; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw: 927; GFX7: ; %bb.0: ; %entry 928; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 929; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 930; GFX7-NEXT: s_waitcnt lgkmcnt(0) 931; GFX7-NEXT: v_mov_b32_e32 v0, s0 932; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 933; GFX7-NEXT: v_mov_b32_e32 v2, s2 934; GFX7-NEXT: flat_atomic_swap v[0:1], v2 935; GFX7-NEXT: s_endpgm 936; 937; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: 938; GFX10-WGP: ; %bb.0: ; %entry 939; GFX10-WGP-NEXT: s_clause 0x1 940; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 941; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 942; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 943; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 944; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 945; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 946; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 947; GFX10-WGP-NEXT: s_endpgm 948; 949; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: 950; GFX10-CU: ; %bb.0: ; %entry 951; GFX10-CU-NEXT: s_clause 0x1 952; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 953; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 954; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 955; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 956; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 957; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 958; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 959; GFX10-CU-NEXT: s_endpgm 960; 961; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw: 962; SKIP-CACHE-INV: ; %bb.0: ; %entry 963; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 964; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 965; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 966; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 967; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 968; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 969; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 970; SKIP-CACHE-INV-NEXT: s_endpgm 971; 972; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 973; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 974; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 975; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 976; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 977; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 978; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 
s2 979; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 980; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 981; 982; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 983; GFX90A-TGSPLIT: ; %bb.0: ; %entry 984; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 985; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 986; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 987; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 988; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 989; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 990; GFX90A-TGSPLIT-NEXT: s_endpgm 991 i32* %out, i32 %in) { 992entry: 993 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst 994 ret void 995} 996 997define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( 998; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: 999; GFX7: ; %bb.0: ; %entry 1000; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1001; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1002; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1003; GFX7-NEXT: v_mov_b32_e32 v0, s0 1004; GFX7-NEXT: v_mov_b32_e32 v1, s1 1005; GFX7-NEXT: v_mov_b32_e32 v2, s2 1006; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1007; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1008; GFX7-NEXT: flat_store_dword v[0:1], v2 1009; GFX7-NEXT: s_endpgm 1010; 1011; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1012; GFX10-WGP: ; %bb.0: ; %entry 1013; GFX10-WGP-NEXT: s_clause 0x1 1014; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1015; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1016; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1017; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1018; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1019; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1020; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1021; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1022; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1023; GFX10-WGP-NEXT: s_endpgm 1024; 1025; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1026; 
GFX10-CU: ; %bb.0: ; %entry 1027; GFX10-CU-NEXT: s_clause 0x1 1028; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1029; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1030; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1031; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1032; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1033; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1034; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1035; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1036; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1037; GFX10-CU-NEXT: s_endpgm 1038; 1039; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1040; SKIP-CACHE-INV: ; %bb.0: ; %entry 1041; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1042; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1043; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1044; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1045; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1046; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1047; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1048; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1049; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1050; SKIP-CACHE-INV-NEXT: s_endpgm 1051; 1052; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1053; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1054; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1055; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1056; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1057; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1058; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1059; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1060; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1061; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1062; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1063; 1064; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1065; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1066; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1067; 
GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1068; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1069; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1070; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1071; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1072; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1073; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1074; GFX90A-TGSPLIT-NEXT: s_endpgm 1075 i32* %out, i32 %in) { 1076entry: 1077 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire 1078 store i32 %val, i32* %out, align 4 1079 ret void 1080} 1081 1082define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( 1083; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1084; GFX7: ; %bb.0: ; %entry 1085; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1086; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1087; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX7-NEXT: v_mov_b32_e32 v0, s0 1089; GFX7-NEXT: v_mov_b32_e32 v1, s1 1090; GFX7-NEXT: v_mov_b32_e32 v2, s2 1091; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1092; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1093; GFX7-NEXT: flat_store_dword v[0:1], v2 1094; GFX7-NEXT: s_endpgm 1095; 1096; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1097; GFX10-WGP: ; %bb.0: ; %entry 1098; GFX10-WGP-NEXT: s_clause 0x1 1099; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1100; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1101; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1102; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1103; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1104; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1105; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1106; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1107; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1108; GFX10-WGP-NEXT: s_endpgm 1109; 1110; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1111; GFX10-CU: ; %bb.0: ; %entry 1112; GFX10-CU-NEXT: s_clause 0x1 1113; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
1114; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1115; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1117; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1118; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1119; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1120; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1121; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1122; GFX10-CU-NEXT: s_endpgm 1123; 1124; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1125; SKIP-CACHE-INV: ; %bb.0: ; %entry 1126; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1127; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1128; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1129; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1130; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1131; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1132; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1133; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1134; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1135; SKIP-CACHE-INV-NEXT: s_endpgm 1136; 1137; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1138; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1139; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1140; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1141; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1142; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1143; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1144; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1145; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1146; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1147; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1148; 1149; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 1150; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1151; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1152; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1153; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1154; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1155; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1156; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1157; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1158; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1159; GFX90A-TGSPLIT-NEXT: s_endpgm 1160 i32* %out, i32 %in) { 1161entry: 1162 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel 1163 store i32 %val, i32* %out, align 4 1164 ret void 1165} 1166 1167define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( 1168; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1169; GFX7: ; %bb.0: ; %entry 1170; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1171; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1172; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1173; GFX7-NEXT: v_mov_b32_e32 v0, s0 1174; GFX7-NEXT: v_mov_b32_e32 v1, s1 1175; GFX7-NEXT: v_mov_b32_e32 v2, s2 1176; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1177; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1178; GFX7-NEXT: flat_store_dword v[0:1], v2 1179; GFX7-NEXT: s_endpgm 1180; 1181; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1182; GFX10-WGP: ; %bb.0: ; %entry 1183; GFX10-WGP-NEXT: s_clause 0x1 1184; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1185; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1186; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1188; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1189; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1190; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1191; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1192; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1193; GFX10-WGP-NEXT: s_endpgm 1194; 1195; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1196; GFX10-CU: ; %bb.0: ; %entry 1197; GFX10-CU-NEXT: s_clause 0x1 1198; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1199; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1200; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1201; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 1202; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1203; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1204; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1205; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1206; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1207; GFX10-CU-NEXT: s_endpgm 1208; 1209; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1210; SKIP-CACHE-INV: ; %bb.0: ; %entry 1211; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1212; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1213; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1214; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1215; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1216; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1217; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1218; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1219; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1220; SKIP-CACHE-INV-NEXT: s_endpgm 1221; 1222; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1223; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1224; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1225; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1226; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1227; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1228; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1229; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1230; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1231; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1232; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1233; 1234; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: 1235; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1236; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1237; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1238; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1239; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1240; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1241; 
GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1242; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1243; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1244; GFX90A-TGSPLIT-NEXT: s_endpgm 1245 i32* %out, i32 %in) { 1246entry: 1247 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst 1248 store i32 %val, i32* %out, align 4 1249 ret void 1250} 1251 1252define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( 1253; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 1254; GFX7: ; %bb.0: ; %entry 1255; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1256; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1257; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1258; GFX7-NEXT: s_add_u32 s0, s0, 16 1259; GFX7-NEXT: s_addc_u32 s1, s1, 0 1260; GFX7-NEXT: v_mov_b32_e32 v0, s0 1261; GFX7-NEXT: v_mov_b32_e32 v2, s2 1262; GFX7-NEXT: v_mov_b32_e32 v1, s1 1263; GFX7-NEXT: v_mov_b32_e32 v3, s3 1264; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1265; GFX7-NEXT: s_endpgm 1266; 1267; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 1268; GFX10-WGP: ; %bb.0: ; %entry 1269; GFX10-WGP-NEXT: s_clause 0x1 1270; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1271; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1272; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1273; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1274; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1275; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1276; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1277; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1278; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1279; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1280; GFX10-WGP-NEXT: s_endpgm 1281; 1282; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 1283; GFX10-CU: ; %bb.0: ; %entry 1284; GFX10-CU-NEXT: s_clause 0x1 1285; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1286; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1287; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1288; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 
1289; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1290; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1291; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1292; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1293; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1294; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1295; GFX10-CU-NEXT: s_endpgm 1296; 1297; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 1298; SKIP-CACHE-INV: ; %bb.0: ; %entry 1299; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1300; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1301; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1302; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1303; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1304; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1305; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1306; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1307; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1308; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1309; SKIP-CACHE-INV-NEXT: s_endpgm 1310; 1311; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 1312; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1313; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1314; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1315; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1316; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1317; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1318; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1319; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1320; 1321; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: 1322; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1323; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1324; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1325; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1326; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1327; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 
1328; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1329; GFX90A-TGSPLIT-NEXT: s_endpgm 1330 i32* %out, i32 %in, i32 %old) { 1331entry: 1332 %gep = getelementptr i32, i32* %out, i32 4 1333 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic 1334 ret void 1335} 1336 1337define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( 1338; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 1339; GFX7: ; %bb.0: ; %entry 1340; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1341; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1342; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX7-NEXT: s_add_u32 s0, s0, 16 1344; GFX7-NEXT: s_addc_u32 s1, s1, 0 1345; GFX7-NEXT: v_mov_b32_e32 v0, s0 1346; GFX7-NEXT: v_mov_b32_e32 v2, s2 1347; GFX7-NEXT: v_mov_b32_e32 v1, s1 1348; GFX7-NEXT: v_mov_b32_e32 v3, s3 1349; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1350; GFX7-NEXT: s_endpgm 1351; 1352; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 1353; GFX10-WGP: ; %bb.0: ; %entry 1354; GFX10-WGP-NEXT: s_clause 0x1 1355; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1356; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1357; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1358; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1359; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1360; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1361; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1362; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1363; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1364; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1365; GFX10-WGP-NEXT: s_endpgm 1366; 1367; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 1368; GFX10-CU: ; %bb.0: ; %entry 1369; GFX10-CU-NEXT: s_clause 0x1 1370; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1371; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1372; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1373; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1374; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1375; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 1376; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1377; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1378; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1379; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1380; GFX10-CU-NEXT: s_endpgm 1381; 1382; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 1383; SKIP-CACHE-INV: ; %bb.0: ; %entry 1384; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1385; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1386; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1387; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1388; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1389; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1390; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1391; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1392; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1393; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1394; SKIP-CACHE-INV-NEXT: s_endpgm 1395; 1396; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 1397; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1398; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1399; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1400; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1401; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1402; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1403; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1404; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1405; 1406; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 1407; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1408; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1409; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1410; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1412; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1413; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 1414; GFX90A-TGSPLIT-NEXT: s_endpgm 1415 i32* %out, i32 %in, i32 %old) { 1416entry: 1417 %gep = getelementptr i32, i32* %out, i32 4 1418 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic 1419 ret void 1420} 1421 1422define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( 1423; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: 1424; GFX7: ; %bb.0: ; %entry 1425; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1426; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1427; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1428; GFX7-NEXT: s_add_u32 s0, s0, 16 1429; GFX7-NEXT: s_addc_u32 s1, s1, 0 1430; GFX7-NEXT: v_mov_b32_e32 v0, s0 1431; GFX7-NEXT: v_mov_b32_e32 v2, s2 1432; GFX7-NEXT: v_mov_b32_e32 v1, s1 1433; GFX7-NEXT: v_mov_b32_e32 v3, s3 1434; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1435; GFX7-NEXT: s_endpgm 1436; 1437; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: 1438; GFX10-WGP: ; %bb.0: ; %entry 1439; GFX10-WGP-NEXT: s_clause 0x1 1440; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1441; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1442; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1444; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1445; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1446; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1447; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1448; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1449; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1450; GFX10-WGP-NEXT: s_endpgm 1451; 1452; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: 1453; GFX10-CU: ; %bb.0: ; %entry 1454; GFX10-CU-NEXT: s_clause 0x1 1455; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1456; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1457; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1458; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1459; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1460; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1461; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 
1462; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1463; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1464; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1465; GFX10-CU-NEXT: s_endpgm 1466; 1467; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg: 1468; SKIP-CACHE-INV: ; %bb.0: ; %entry 1469; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1470; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1471; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1472; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1473; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1474; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1475; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1476; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1477; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1478; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1479; SKIP-CACHE-INV-NEXT: s_endpgm 1480; 1481; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 1482; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1483; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1484; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1485; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1486; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1487; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1488; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1489; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1490; 1491; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 1492; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1493; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1494; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1495; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1496; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1497; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1498; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1499; GFX90A-TGSPLIT-NEXT: s_endpgm 1500 i32* %out, i32 
%in, i32 %old) { 1501entry: 1502 %gep = getelementptr i32, i32* %out, i32 4 1503 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic 1504 ret void 1505} 1506 1507define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( 1508; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 1509; GFX7: ; %bb.0: ; %entry 1510; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1511; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1512; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1513; GFX7-NEXT: s_add_u32 s0, s0, 16 1514; GFX7-NEXT: s_addc_u32 s1, s1, 0 1515; GFX7-NEXT: v_mov_b32_e32 v0, s0 1516; GFX7-NEXT: v_mov_b32_e32 v2, s2 1517; GFX7-NEXT: v_mov_b32_e32 v1, s1 1518; GFX7-NEXT: v_mov_b32_e32 v3, s3 1519; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1520; GFX7-NEXT: s_endpgm 1521; 1522; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 1523; GFX10-WGP: ; %bb.0: ; %entry 1524; GFX10-WGP-NEXT: s_clause 0x1 1525; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1526; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1527; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1528; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1529; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1530; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1531; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1532; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1533; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1534; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1535; GFX10-WGP-NEXT: s_endpgm 1536; 1537; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 1538; GFX10-CU: ; %bb.0: ; %entry 1539; GFX10-CU-NEXT: s_clause 0x1 1540; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1541; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1542; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1543; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1544; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1545; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1546; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1547; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1548; GFX10-CU-NEXT: 
v_mov_b32_e32 v3, s3 1549; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1550; GFX10-CU-NEXT: s_endpgm 1551; 1552; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 1553; SKIP-CACHE-INV: ; %bb.0: ; %entry 1554; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1555; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1556; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1557; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1558; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1559; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1560; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1561; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1562; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1563; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1564; SKIP-CACHE-INV-NEXT: s_endpgm 1565; 1566; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 1567; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1568; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1569; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1570; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1571; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1572; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1573; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1574; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1575; 1576; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 1577; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1578; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1579; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1580; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1581; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1582; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1583; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1584; GFX90A-TGSPLIT-NEXT: s_endpgm 1585 i32* %out, i32 %in, i32 %old) { 1586entry: 1587 %gep = getelementptr i32, i32* 
%out, i32 4 1588 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic 1589 ret void 1590} 1591 1592define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( 1593; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 1594; GFX7: ; %bb.0: ; %entry 1595; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1596; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1597; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1598; GFX7-NEXT: s_add_u32 s0, s0, 16 1599; GFX7-NEXT: s_addc_u32 s1, s1, 0 1600; GFX7-NEXT: v_mov_b32_e32 v0, s0 1601; GFX7-NEXT: v_mov_b32_e32 v2, s2 1602; GFX7-NEXT: v_mov_b32_e32 v1, s1 1603; GFX7-NEXT: v_mov_b32_e32 v3, s3 1604; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1605; GFX7-NEXT: s_endpgm 1606; 1607; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 1608; GFX10-WGP: ; %bb.0: ; %entry 1609; GFX10-WGP-NEXT: s_clause 0x1 1610; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1611; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1612; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1613; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1614; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1615; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1616; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1617; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1618; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1619; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1620; GFX10-WGP-NEXT: s_endpgm 1621; 1622; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 1623; GFX10-CU: ; %bb.0: ; %entry 1624; GFX10-CU-NEXT: s_clause 0x1 1625; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1626; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1627; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1628; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1629; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1630; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1631; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1632; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1633; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1634; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], 
v[2:3] 1635; GFX10-CU-NEXT: s_endpgm 1636; 1637; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 1638; SKIP-CACHE-INV: ; %bb.0: ; %entry 1639; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1640; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1641; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1642; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1643; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1644; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1646; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1647; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1648; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1649; SKIP-CACHE-INV-NEXT: s_endpgm 1650; 1651; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 1652; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1653; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1654; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1655; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1656; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1657; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1658; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1659; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1660; 1661; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 1662; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1663; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1664; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1665; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1666; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1667; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1668; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1669; GFX90A-TGSPLIT-NEXT: s_endpgm 1670 i32* %out, i32 %in, i32 %old) { 1671entry: 1672 %gep = getelementptr i32, i32* %out, i32 4 1673 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in 
syncscope("singlethread") seq_cst monotonic 1674 ret void 1675} 1676 1677define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( 1678; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 1679; GFX7: ; %bb.0: ; %entry 1680; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1681; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1682; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1683; GFX7-NEXT: s_add_u32 s0, s0, 16 1684; GFX7-NEXT: s_addc_u32 s1, s1, 0 1685; GFX7-NEXT: v_mov_b32_e32 v0, s0 1686; GFX7-NEXT: v_mov_b32_e32 v2, s2 1687; GFX7-NEXT: v_mov_b32_e32 v1, s1 1688; GFX7-NEXT: v_mov_b32_e32 v3, s3 1689; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1690; GFX7-NEXT: s_endpgm 1691; 1692; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 1693; GFX10-WGP: ; %bb.0: ; %entry 1694; GFX10-WGP-NEXT: s_clause 0x1 1695; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1696; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1697; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1698; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1699; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1700; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1701; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1702; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1703; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1704; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1705; GFX10-WGP-NEXT: s_endpgm 1706; 1707; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 1708; GFX10-CU: ; %bb.0: ; %entry 1709; GFX10-CU-NEXT: s_clause 0x1 1710; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1711; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1712; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1713; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1714; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1715; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1716; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1717; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1718; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1719; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1720; GFX10-CU-NEXT: s_endpgm 1721; 1722; SKIP-CACHE-INV-LABEL: 
flat_singlethread_monotonic_acquire_cmpxchg: 1723; SKIP-CACHE-INV: ; %bb.0: ; %entry 1724; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1725; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1726; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1727; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1728; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1729; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1730; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1731; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1732; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1733; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1734; SKIP-CACHE-INV-NEXT: s_endpgm 1735; 1736; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 1737; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1738; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1739; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1740; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1741; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1742; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1743; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1744; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1745; 1746; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 1747; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1748; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1749; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1750; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1751; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1752; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1753; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1754; GFX90A-TGSPLIT-NEXT: s_endpgm 1755 i32* %out, i32 %in, i32 %old) { 1756entry: 1757 %gep = getelementptr i32, i32* %out, i32 4 1758 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire 1759 ret void 1760} 1761 
1762define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( 1763; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 1764; GFX7: ; %bb.0: ; %entry 1765; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1766; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1767; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX7-NEXT: s_add_u32 s0, s0, 16 1769; GFX7-NEXT: s_addc_u32 s1, s1, 0 1770; GFX7-NEXT: v_mov_b32_e32 v0, s0 1771; GFX7-NEXT: v_mov_b32_e32 v2, s2 1772; GFX7-NEXT: v_mov_b32_e32 v1, s1 1773; GFX7-NEXT: v_mov_b32_e32 v3, s3 1774; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1775; GFX7-NEXT: s_endpgm 1776; 1777; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 1778; GFX10-WGP: ; %bb.0: ; %entry 1779; GFX10-WGP-NEXT: s_clause 0x1 1780; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1781; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1782; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1783; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1784; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1785; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1786; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1787; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1788; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1789; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1790; GFX10-WGP-NEXT: s_endpgm 1791; 1792; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 1793; GFX10-CU: ; %bb.0: ; %entry 1794; GFX10-CU-NEXT: s_clause 0x1 1795; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1796; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1797; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1798; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1799; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1800; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1801; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1802; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1803; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1804; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1805; GFX10-CU-NEXT: s_endpgm 1806; 1807; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 1808; SKIP-CACHE-INV: ; %bb.0: ; 
%entry 1809; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1810; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1811; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1812; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1813; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1814; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1815; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1816; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1817; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1818; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1819; SKIP-CACHE-INV-NEXT: s_endpgm 1820; 1821; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 1822; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1823; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1824; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1825; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1826; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1827; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1828; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1829; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1830; 1831; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: 1832; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1833; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1834; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1835; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1836; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1837; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1838; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1839; GFX90A-TGSPLIT-NEXT: s_endpgm 1840 i32* %out, i32 %in, i32 %old) { 1841entry: 1842 %gep = getelementptr i32, i32* %out, i32 4 1843 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire 1844 ret void 1845} 1846 1847define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( 1848; 
GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: 1849; GFX7: ; %bb.0: ; %entry 1850; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1851; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1852; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1853; GFX7-NEXT: s_add_u32 s0, s0, 16 1854; GFX7-NEXT: s_addc_u32 s1, s1, 0 1855; GFX7-NEXT: v_mov_b32_e32 v0, s0 1856; GFX7-NEXT: v_mov_b32_e32 v2, s2 1857; GFX7-NEXT: v_mov_b32_e32 v1, s1 1858; GFX7-NEXT: v_mov_b32_e32 v3, s3 1859; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1860; GFX7-NEXT: s_endpgm 1861; 1862; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: 1863; GFX10-WGP: ; %bb.0: ; %entry 1864; GFX10-WGP-NEXT: s_clause 0x1 1865; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1866; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1867; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1868; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1869; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1870; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1871; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1872; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1873; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1874; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1875; GFX10-WGP-NEXT: s_endpgm 1876; 1877; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: 1878; GFX10-CU: ; %bb.0: ; %entry 1879; GFX10-CU-NEXT: s_clause 0x1 1880; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1881; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1882; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1883; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1884; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1885; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1886; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1887; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1888; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1889; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1890; GFX10-CU-NEXT: s_endpgm 1891; 1892; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg: 1893; SKIP-CACHE-INV: ; %bb.0: ; %entry 1894; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1895; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1896; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1897; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1898; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1899; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1900; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1901; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1902; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1903; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1904; SKIP-CACHE-INV-NEXT: s_endpgm 1905; 1906; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: 1907; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1908; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1909; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1910; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1911; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1912; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1913; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1914; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1915; 1916; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: 1917; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1918; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1919; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1920; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1921; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1922; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1923; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1924; GFX90A-TGSPLIT-NEXT: s_endpgm 1925 i32* %out, i32 %in, i32 %old) { 1926entry: 1927 %gep = getelementptr i32, i32* %out, i32 4 1928 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire 1929 ret void 1930} 1931 1932define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( 1933; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 1934; GFX7: ; %bb.0: ; 
%entry 1935; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1936; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1937; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1938; GFX7-NEXT: s_add_u32 s0, s0, 16 1939; GFX7-NEXT: s_addc_u32 s1, s1, 0 1940; GFX7-NEXT: v_mov_b32_e32 v0, s0 1941; GFX7-NEXT: v_mov_b32_e32 v2, s2 1942; GFX7-NEXT: v_mov_b32_e32 v1, s1 1943; GFX7-NEXT: v_mov_b32_e32 v3, s3 1944; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1945; GFX7-NEXT: s_endpgm 1946; 1947; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 1948; GFX10-WGP: ; %bb.0: ; %entry 1949; GFX10-WGP-NEXT: s_clause 0x1 1950; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1951; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1952; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1953; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1954; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1955; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1956; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1957; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1958; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1959; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1960; GFX10-WGP-NEXT: s_endpgm 1961; 1962; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 1963; GFX10-CU: ; %bb.0: ; %entry 1964; GFX10-CU-NEXT: s_clause 0x1 1965; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1966; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1967; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1968; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1969; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1970; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1971; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1972; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1973; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1974; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1975; GFX10-CU-NEXT: s_endpgm 1976; 1977; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 1978; SKIP-CACHE-INV: ; %bb.0: ; %entry 1979; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1980; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1981; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) 1982; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1983; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1984; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1985; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1986; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1987; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1988; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1989; SKIP-CACHE-INV-NEXT: s_endpgm 1990; 1991; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 1992; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1993; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1994; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1995; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1996; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1997; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1998; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1999; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2000; 2001; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: 2002; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2003; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2004; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2005; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2006; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2007; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2008; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2009; GFX90A-TGSPLIT-NEXT: s_endpgm 2010 i32* %out, i32 %in, i32 %old) { 2011entry: 2012 %gep = getelementptr i32, i32* %out, i32 4 2013 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire 2014 ret void 2015} 2016 2017define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( 2018; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 2019; GFX7: ; %bb.0: ; %entry 2020; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2021; GFX7-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 2022; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2023; GFX7-NEXT: s_add_u32 s0, s0, 16 2024; GFX7-NEXT: s_addc_u32 s1, s1, 0 2025; GFX7-NEXT: v_mov_b32_e32 v0, s0 2026; GFX7-NEXT: v_mov_b32_e32 v2, s2 2027; GFX7-NEXT: v_mov_b32_e32 v1, s1 2028; GFX7-NEXT: v_mov_b32_e32 v3, s3 2029; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2030; GFX7-NEXT: s_endpgm 2031; 2032; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 2033; GFX10-WGP: ; %bb.0: ; %entry 2034; GFX10-WGP-NEXT: s_clause 0x1 2035; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2036; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2037; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2038; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2039; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2040; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2041; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2042; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2043; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2044; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2045; GFX10-WGP-NEXT: s_endpgm 2046; 2047; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 2048; GFX10-CU: ; %bb.0: ; %entry 2049; GFX10-CU-NEXT: s_clause 0x1 2050; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2051; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2052; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2053; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2054; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2055; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2056; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2057; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2058; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2059; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2060; GFX10-CU-NEXT: s_endpgm 2061; 2062; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 2063; SKIP-CACHE-INV: ; %bb.0: ; %entry 2064; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2065; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2066; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2067; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2068; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2069; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2070; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2071; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2072; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2073; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2074; SKIP-CACHE-INV-NEXT: s_endpgm 2075; 2076; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 2077; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2078; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2079; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2080; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2081; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2082; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2083; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2084; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2085; 2086; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 2087; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2088; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2089; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2090; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2091; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2092; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2093; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2094; GFX90A-TGSPLIT-NEXT: s_endpgm 2095 i32* %out, i32 %in, i32 %old) { 2096entry: 2097 %gep = getelementptr i32, i32* %out, i32 4 2098 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire 2099 ret void 2100} 2101 2102define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( 2103; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 2104; GFX7: ; %bb.0: ; %entry 2105; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2106; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2107; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2108; 
GFX7-NEXT: s_add_u32 s0, s0, 16 2109; GFX7-NEXT: s_addc_u32 s1, s1, 0 2110; GFX7-NEXT: v_mov_b32_e32 v0, s0 2111; GFX7-NEXT: v_mov_b32_e32 v2, s2 2112; GFX7-NEXT: v_mov_b32_e32 v1, s1 2113; GFX7-NEXT: v_mov_b32_e32 v3, s3 2114; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2115; GFX7-NEXT: s_endpgm 2116; 2117; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 2118; GFX10-WGP: ; %bb.0: ; %entry 2119; GFX10-WGP-NEXT: s_clause 0x1 2120; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2121; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2122; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2123; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2124; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2125; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2126; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2127; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2128; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2129; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2130; GFX10-WGP-NEXT: s_endpgm 2131; 2132; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 2133; GFX10-CU: ; %bb.0: ; %entry 2134; GFX10-CU-NEXT: s_clause 0x1 2135; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2136; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2137; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2138; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2139; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2140; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2141; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2142; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2143; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2144; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2145; GFX10-CU-NEXT: s_endpgm 2146; 2147; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 2148; SKIP-CACHE-INV: ; %bb.0: ; %entry 2149; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2150; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2151; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2152; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2153; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2154; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s2 2155; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2156; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2157; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2158; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2159; SKIP-CACHE-INV-NEXT: s_endpgm 2160; 2161; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 2162; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2163; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2164; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2165; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2166; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2167; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2168; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2169; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2170; 2171; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 2172; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2173; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2174; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2175; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2176; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2177; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2178; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2179; GFX90A-TGSPLIT-NEXT: s_endpgm 2180 i32* %out, i32 %in, i32 %old) { 2181entry: 2182 %gep = getelementptr i32, i32* %out, i32 4 2183 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst 2184 ret void 2185} 2186 2187define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( 2188; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 2189; GFX7: ; %bb.0: ; %entry 2190; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2191; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2192; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2193; GFX7-NEXT: s_add_u32 s0, s0, 16 2194; GFX7-NEXT: s_addc_u32 s1, s1, 0 
2195; GFX7-NEXT: v_mov_b32_e32 v0, s0 2196; GFX7-NEXT: v_mov_b32_e32 v2, s2 2197; GFX7-NEXT: v_mov_b32_e32 v1, s1 2198; GFX7-NEXT: v_mov_b32_e32 v3, s3 2199; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2200; GFX7-NEXT: s_endpgm 2201; 2202; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 2203; GFX10-WGP: ; %bb.0: ; %entry 2204; GFX10-WGP-NEXT: s_clause 0x1 2205; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2206; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2207; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2208; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2209; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2210; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2211; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2212; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2213; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2214; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2215; GFX10-WGP-NEXT: s_endpgm 2216; 2217; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 2218; GFX10-CU: ; %bb.0: ; %entry 2219; GFX10-CU-NEXT: s_clause 0x1 2220; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2221; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2222; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2223; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2224; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2225; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2226; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2227; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2228; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2229; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2230; GFX10-CU-NEXT: s_endpgm 2231; 2232; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 2233; SKIP-CACHE-INV: ; %bb.0: ; %entry 2234; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2235; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2236; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2237; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2238; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2239; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2240; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2241; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2242; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2243; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2244; SKIP-CACHE-INV-NEXT: s_endpgm 2245; 2246; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 2247; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2248; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2249; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2250; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2251; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2252; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2253; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2254; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2255; 2256; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 2257; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2258; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2259; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2260; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2261; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2262; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2263; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2264; GFX90A-TGSPLIT-NEXT: s_endpgm 2265 i32* %out, i32 %in, i32 %old) { 2266entry: 2267 %gep = getelementptr i32, i32* %out, i32 4 2268 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst 2269 ret void 2270} 2271 2272define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( 2273; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 2274; GFX7: ; %bb.0: ; %entry 2275; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2276; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2277; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2278; GFX7-NEXT: s_add_u32 s0, s0, 16 2279; GFX7-NEXT: s_addc_u32 s1, s1, 0 2280; GFX7-NEXT: v_mov_b32_e32 v0, s0 2281; GFX7-NEXT: v_mov_b32_e32 v2, s2 2282; 
GFX7-NEXT: v_mov_b32_e32 v1, s1 2283; GFX7-NEXT: v_mov_b32_e32 v3, s3 2284; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2285; GFX7-NEXT: s_endpgm 2286; 2287; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 2288; GFX10-WGP: ; %bb.0: ; %entry 2289; GFX10-WGP-NEXT: s_clause 0x1 2290; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2291; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2292; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2293; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2294; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2295; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2296; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2297; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2298; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2299; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2300; GFX10-WGP-NEXT: s_endpgm 2301; 2302; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 2303; GFX10-CU: ; %bb.0: ; %entry 2304; GFX10-CU-NEXT: s_clause 0x1 2305; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2306; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2307; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2308; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2309; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2310; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2311; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2312; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2313; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2314; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2315; GFX10-CU-NEXT: s_endpgm 2316; 2317; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 2318; SKIP-CACHE-INV: ; %bb.0: ; %entry 2319; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2320; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2321; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2322; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2323; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2324; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2325; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2326; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2327; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 
s0 2328; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2329; SKIP-CACHE-INV-NEXT: s_endpgm 2330; 2331; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 2332; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2333; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2334; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2335; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2336; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2337; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2338; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2339; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2340; 2341; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 2342; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2343; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2344; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2345; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2346; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2347; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2348; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2349; GFX90A-TGSPLIT-NEXT: s_endpgm 2350 i32* %out, i32 %in, i32 %old) { 2351entry: 2352 %gep = getelementptr i32, i32* %out, i32 4 2353 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst 2354 ret void 2355} 2356 2357define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( 2358; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 2359; GFX7: ; %bb.0: ; %entry 2360; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2361; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2362; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2363; GFX7-NEXT: s_add_u32 s0, s0, 16 2364; GFX7-NEXT: s_addc_u32 s1, s1, 0 2365; GFX7-NEXT: v_mov_b32_e32 v0, s0 2366; GFX7-NEXT: v_mov_b32_e32 v2, s2 2367; GFX7-NEXT: v_mov_b32_e32 v1, s1 2368; GFX7-NEXT: v_mov_b32_e32 v3, s3 2369; GFX7-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 2370; GFX7-NEXT: s_endpgm 2371; 2372; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 2373; GFX10-WGP: ; %bb.0: ; %entry 2374; GFX10-WGP-NEXT: s_clause 0x1 2375; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2376; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2377; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2378; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2379; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2380; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2381; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2382; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2383; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2384; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2385; GFX10-WGP-NEXT: s_endpgm 2386; 2387; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 2388; GFX10-CU: ; %bb.0: ; %entry 2389; GFX10-CU-NEXT: s_clause 0x1 2390; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2391; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2392; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2393; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2394; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2395; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2396; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2397; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2398; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2399; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2400; GFX10-CU-NEXT: s_endpgm 2401; 2402; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 2403; SKIP-CACHE-INV: ; %bb.0: ; %entry 2404; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2405; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2406; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2407; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2408; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2409; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2410; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2411; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2412; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2413; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2414; 
SKIP-CACHE-INV-NEXT: s_endpgm 2415; 2416; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 2417; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2418; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2419; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2420; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2421; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2422; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2423; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2424; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2425; 2426; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 2427; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2428; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2429; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2430; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2431; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2432; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2433; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2434; GFX90A-TGSPLIT-NEXT: s_endpgm 2435 i32* %out, i32 %in, i32 %old) { 2436entry: 2437 %gep = getelementptr i32, i32* %out, i32 4 2438 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst 2439 ret void 2440} 2441 2442define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( 2443; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 2444; GFX7: ; %bb.0: ; %entry 2445; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2446; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2447; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2448; GFX7-NEXT: s_add_u32 s0, s0, 16 2449; GFX7-NEXT: s_addc_u32 s1, s1, 0 2450; GFX7-NEXT: v_mov_b32_e32 v0, s0 2451; GFX7-NEXT: v_mov_b32_e32 v2, s2 2452; GFX7-NEXT: v_mov_b32_e32 v1, s1 2453; GFX7-NEXT: v_mov_b32_e32 v3, s3 2454; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2455; GFX7-NEXT: s_endpgm 2456; 
2457; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 2458; GFX10-WGP: ; %bb.0: ; %entry 2459; GFX10-WGP-NEXT: s_clause 0x1 2460; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2461; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2462; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2463; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2464; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2465; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2466; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2467; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2468; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2469; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2470; GFX10-WGP-NEXT: s_endpgm 2471; 2472; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 2473; GFX10-CU: ; %bb.0: ; %entry 2474; GFX10-CU-NEXT: s_clause 0x1 2475; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2476; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2477; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2478; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2479; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2480; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2481; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2482; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2483; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2484; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2485; GFX10-CU-NEXT: s_endpgm 2486; 2487; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 2488; SKIP-CACHE-INV: ; %bb.0: ; %entry 2489; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2490; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2491; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2492; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2493; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2494; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2495; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2496; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2497; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2498; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2499; SKIP-CACHE-INV-NEXT: s_endpgm 2500; 2501; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_seq_cst_seq_cst_cmpxchg: 2502; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2503; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2504; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2505; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2506; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2507; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2508; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2509; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2510; 2511; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 2512; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2513; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2514; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2515; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2516; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2517; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2518; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2519; GFX90A-TGSPLIT-NEXT: s_endpgm 2520 i32* %out, i32 %in, i32 %old) { 2521entry: 2522 %gep = getelementptr i32, i32* %out, i32 4 2523 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst 2524 ret void 2525} 2526 2527define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( 2528; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 2529; GFX7: ; %bb.0: ; %entry 2530; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2531; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2532; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2533; GFX7-NEXT: s_add_u32 s4, s0, 16 2534; GFX7-NEXT: s_addc_u32 s5, s1, 0 2535; GFX7-NEXT: v_mov_b32_e32 v0, s4 2536; GFX7-NEXT: v_mov_b32_e32 v2, s2 2537; GFX7-NEXT: v_mov_b32_e32 v1, s5 2538; GFX7-NEXT: v_mov_b32_e32 v3, s3 2539; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2540; GFX7-NEXT: v_mov_b32_e32 v0, s0 2541; GFX7-NEXT: v_mov_b32_e32 v1, s1 
2542; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2543; GFX7-NEXT: flat_store_dword v[0:1], v2 2544; GFX7-NEXT: s_endpgm 2545; 2546; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 2547; GFX10-WGP: ; %bb.0: ; %entry 2548; GFX10-WGP-NEXT: s_clause 0x1 2549; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2550; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2551; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2552; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2553; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2554; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2555; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2556; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2557; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2558; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2559; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2560; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2561; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2562; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2563; GFX10-WGP-NEXT: s_endpgm 2564; 2565; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 2566; GFX10-CU: ; %bb.0: ; %entry 2567; GFX10-CU-NEXT: s_clause 0x1 2568; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2569; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2570; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2571; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2572; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2573; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2574; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2575; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2576; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2577; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2578; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2579; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2580; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2581; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2582; GFX10-CU-NEXT: s_endpgm 2583; 2584; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 2585; SKIP-CACHE-INV: ; %bb.0: ; %entry 2586; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2587; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2588; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2589; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2590; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2591; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2592; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2593; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2594; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2595; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2596; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2597; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2598; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2599; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2600; SKIP-CACHE-INV-NEXT: s_endpgm 2601; 2602; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 2603; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2604; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2605; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2606; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2607; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2608; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2609; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2610; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2611; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2612; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2613; 2614; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 2615; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2616; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2617; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2618; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2619; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2620; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2621; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2622; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2623; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2624; GFX90A-TGSPLIT-NEXT: s_endpgm 2625 i32* %out, i32 %in, i32 %old) { 2626entry: 2627 %gep = getelementptr i32, i32* %out, i32 4 2628 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic 2629 %val0 = extractvalue { i32, i1 } %val, 0 2630 store i32 %val0, i32* %out, align 4 2631 ret void 2632} 2633 2634define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( 2635; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 2636; GFX7: ; %bb.0: ; %entry 2637; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2638; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2639; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2640; GFX7-NEXT: s_add_u32 s4, s0, 16 2641; GFX7-NEXT: s_addc_u32 s5, s1, 0 2642; GFX7-NEXT: v_mov_b32_e32 v0, s4 2643; GFX7-NEXT: v_mov_b32_e32 v2, s2 2644; GFX7-NEXT: v_mov_b32_e32 v1, s5 2645; GFX7-NEXT: v_mov_b32_e32 v3, s3 2646; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2647; GFX7-NEXT: v_mov_b32_e32 v0, s0 2648; GFX7-NEXT: v_mov_b32_e32 v1, s1 2649; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2650; GFX7-NEXT: flat_store_dword v[0:1], v2 2651; GFX7-NEXT: s_endpgm 2652; 2653; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 2654; GFX10-WGP: ; %bb.0: ; %entry 2655; GFX10-WGP-NEXT: s_clause 0x1 2656; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2657; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2658; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2659; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2660; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2661; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2662; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2663; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2664; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2665; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2666; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2667; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2668; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2669; GFX10-WGP-NEXT: 
flat_store_dword v[0:1], v2 2670; GFX10-WGP-NEXT: s_endpgm 2671; 2672; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 2673; GFX10-CU: ; %bb.0: ; %entry 2674; GFX10-CU-NEXT: s_clause 0x1 2675; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2676; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2677; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2678; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2679; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2680; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2681; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2682; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2683; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2684; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2685; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2686; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2687; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2688; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2689; GFX10-CU-NEXT: s_endpgm 2690; 2691; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 2692; SKIP-CACHE-INV: ; %bb.0: ; %entry 2693; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2694; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2695; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2696; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2697; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2698; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2699; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2700; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2701; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2702; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2703; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2704; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2705; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2706; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2707; SKIP-CACHE-INV-NEXT: s_endpgm 2708; 2709; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 2710; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2711; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2712; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2713; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2714; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2715; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2716; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2717; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2718; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2719; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2720; 2721; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 2722; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2723; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2724; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2725; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2726; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2727; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2728; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2729; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2730; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2731; GFX90A-TGSPLIT-NEXT: s_endpgm 2732 i32* %out, i32 %in, i32 %old) { 2733entry: 2734 %gep = getelementptr i32, i32* %out, i32 4 2735 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic 2736 %val0 = extractvalue { i32, i1 } %val, 0 2737 store i32 %val0, i32* %out, align 4 2738 ret void 2739} 2740 2741define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( 2742; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 2743; GFX7: ; %bb.0: ; %entry 2744; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2745; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2746; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2747; GFX7-NEXT: s_add_u32 s4, s0, 16 2748; GFX7-NEXT: s_addc_u32 s5, s1, 0 2749; GFX7-NEXT: v_mov_b32_e32 v0, s4 2750; GFX7-NEXT: v_mov_b32_e32 v2, s2 2751; GFX7-NEXT: v_mov_b32_e32 v1, s5 2752; 
GFX7-NEXT: v_mov_b32_e32 v3, s3 2753; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2754; GFX7-NEXT: v_mov_b32_e32 v0, s0 2755; GFX7-NEXT: v_mov_b32_e32 v1, s1 2756; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2757; GFX7-NEXT: flat_store_dword v[0:1], v2 2758; GFX7-NEXT: s_endpgm 2759; 2760; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 2761; GFX10-WGP: ; %bb.0: ; %entry 2762; GFX10-WGP-NEXT: s_clause 0x1 2763; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2764; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2765; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2766; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2767; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2768; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2769; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2770; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2771; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2772; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2773; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2774; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2775; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2776; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2777; GFX10-WGP-NEXT: s_endpgm 2778; 2779; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 2780; GFX10-CU: ; %bb.0: ; %entry 2781; GFX10-CU-NEXT: s_clause 0x1 2782; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2783; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2784; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2785; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2786; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2787; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2788; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2789; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2790; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2791; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2792; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2793; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2794; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2795; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2796; GFX10-CU-NEXT: s_endpgm 2797; 2798; SKIP-CACHE-INV-LABEL: 
flat_singlethread_release_monotonic_ret_cmpxchg: 2799; SKIP-CACHE-INV: ; %bb.0: ; %entry 2800; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2801; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2802; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2803; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2804; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2805; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2806; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2807; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2808; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2809; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2810; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2811; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2812; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2813; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2814; SKIP-CACHE-INV-NEXT: s_endpgm 2815; 2816; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 2817; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2818; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2819; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2820; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2821; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2822; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2823; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2824; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2825; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2826; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2827; 2828; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 2829; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2830; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2831; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2832; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2833; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2834; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], 
s[2:3] op_sel:[0,1] 2835; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2836; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2837; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2838; GFX90A-TGSPLIT-NEXT: s_endpgm 2839 i32* %out, i32 %in, i32 %old) { 2840entry: 2841 %gep = getelementptr i32, i32* %out, i32 4 2842 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic 2843 %val0 = extractvalue { i32, i1 } %val, 0 2844 store i32 %val0, i32* %out, align 4 2845 ret void 2846} 2847 2848define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( 2849; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 2850; GFX7: ; %bb.0: ; %entry 2851; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2852; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2853; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2854; GFX7-NEXT: s_add_u32 s4, s0, 16 2855; GFX7-NEXT: s_addc_u32 s5, s1, 0 2856; GFX7-NEXT: v_mov_b32_e32 v0, s4 2857; GFX7-NEXT: v_mov_b32_e32 v2, s2 2858; GFX7-NEXT: v_mov_b32_e32 v1, s5 2859; GFX7-NEXT: v_mov_b32_e32 v3, s3 2860; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2861; GFX7-NEXT: v_mov_b32_e32 v0, s0 2862; GFX7-NEXT: v_mov_b32_e32 v1, s1 2863; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2864; GFX7-NEXT: flat_store_dword v[0:1], v2 2865; GFX7-NEXT: s_endpgm 2866; 2867; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 2868; GFX10-WGP: ; %bb.0: ; %entry 2869; GFX10-WGP-NEXT: s_clause 0x1 2870; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2871; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2872; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2873; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2874; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2875; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2876; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2877; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2878; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2879; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2880; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 2881; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2882; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2883; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2884; GFX10-WGP-NEXT: s_endpgm 2885; 2886; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 2887; GFX10-CU: ; %bb.0: ; %entry 2888; GFX10-CU-NEXT: s_clause 0x1 2889; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2890; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2891; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2892; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2893; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2894; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2895; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2896; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2897; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2898; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2899; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2900; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2901; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2902; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2903; GFX10-CU-NEXT: s_endpgm 2904; 2905; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 2906; SKIP-CACHE-INV: ; %bb.0: ; %entry 2907; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2908; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2909; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2910; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2911; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2912; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2913; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2914; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2915; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2916; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2917; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2918; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2919; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2920; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2921; SKIP-CACHE-INV-NEXT: s_endpgm 2922; 2923; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 2924; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2925; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2926; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2927; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2928; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2929; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2930; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2931; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2932; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2933; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2934; 2935; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: 2936; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2937; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2938; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2939; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2940; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2941; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2942; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2943; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2944; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2945; GFX90A-TGSPLIT-NEXT: s_endpgm 2946 i32* %out, i32 %in, i32 %old) { 2947entry: 2948 %gep = getelementptr i32, i32* %out, i32 4 2949 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic 2950 %val0 = extractvalue { i32, i1 } %val, 0 2951 store i32 %val0, i32* %out, align 4 2952 ret void 2953} 2954 2955define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( 2956; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 2957; GFX7: ; %bb.0: ; %entry 2958; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2959; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2960; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2961; GFX7-NEXT: s_add_u32 s4, s0, 
16 2962; GFX7-NEXT: s_addc_u32 s5, s1, 0 2963; GFX7-NEXT: v_mov_b32_e32 v0, s4 2964; GFX7-NEXT: v_mov_b32_e32 v2, s2 2965; GFX7-NEXT: v_mov_b32_e32 v1, s5 2966; GFX7-NEXT: v_mov_b32_e32 v3, s3 2967; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2968; GFX7-NEXT: v_mov_b32_e32 v0, s0 2969; GFX7-NEXT: v_mov_b32_e32 v1, s1 2970; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2971; GFX7-NEXT: flat_store_dword v[0:1], v2 2972; GFX7-NEXT: s_endpgm 2973; 2974; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 2975; GFX10-WGP: ; %bb.0: ; %entry 2976; GFX10-WGP-NEXT: s_clause 0x1 2977; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2978; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2979; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2980; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2981; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2982; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2983; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2984; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2985; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2986; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2987; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2988; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2989; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2990; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2991; GFX10-WGP-NEXT: s_endpgm 2992; 2993; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 2994; GFX10-CU: ; %bb.0: ; %entry 2995; GFX10-CU-NEXT: s_clause 0x1 2996; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2997; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2998; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2999; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3000; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3001; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3002; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3003; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3004; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3005; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3006; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3007; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3008; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3009; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3010; GFX10-CU-NEXT: s_endpgm 3011; 3012; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 3013; SKIP-CACHE-INV: ; %bb.0: ; %entry 3014; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3015; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3016; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3017; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3018; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3019; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3020; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3021; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3022; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3023; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3024; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3025; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3026; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3027; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3028; SKIP-CACHE-INV-NEXT: s_endpgm 3029; 3030; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 3031; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3032; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3033; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3034; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3035; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3036; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3037; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3038; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3039; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3040; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3041; 3042; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: 3043; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3044; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3045; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3046; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3047; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3048; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3049; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3050; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3051; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3052; GFX90A-TGSPLIT-NEXT: s_endpgm 3053 i32* %out, i32 %in, i32 %old) { 3054entry: 3055 %gep = getelementptr i32, i32* %out, i32 4 3056 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic 3057 %val0 = extractvalue { i32, i1 } %val, 0 3058 store i32 %val0, i32* %out, align 4 3059 ret void 3060} 3061 3062define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( 3063; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 3064; GFX7: ; %bb.0: ; %entry 3065; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3066; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3067; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3068; GFX7-NEXT: s_add_u32 s4, s0, 16 3069; GFX7-NEXT: s_addc_u32 s5, s1, 0 3070; GFX7-NEXT: v_mov_b32_e32 v0, s4 3071; GFX7-NEXT: v_mov_b32_e32 v2, s2 3072; GFX7-NEXT: v_mov_b32_e32 v1, s5 3073; GFX7-NEXT: v_mov_b32_e32 v3, s3 3074; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3075; GFX7-NEXT: v_mov_b32_e32 v0, s0 3076; GFX7-NEXT: v_mov_b32_e32 v1, s1 3077; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3078; GFX7-NEXT: flat_store_dword v[0:1], v2 3079; GFX7-NEXT: s_endpgm 3080; 3081; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 3082; GFX10-WGP: ; %bb.0: ; %entry 3083; GFX10-WGP-NEXT: s_clause 0x1 3084; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3085; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3086; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3087; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3088; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3089; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3090; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 
3091; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3092; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3093; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3094; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3095; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3096; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3097; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3098; GFX10-WGP-NEXT: s_endpgm 3099; 3100; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 3101; GFX10-CU: ; %bb.0: ; %entry 3102; GFX10-CU-NEXT: s_clause 0x1 3103; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3104; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3105; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3106; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3107; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3108; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3109; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3110; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3111; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3112; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3113; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3114; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3115; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3116; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3117; GFX10-CU-NEXT: s_endpgm 3118; 3119; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 3120; SKIP-CACHE-INV: ; %bb.0: ; %entry 3121; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3122; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3123; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3124; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3125; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3126; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3127; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3128; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3129; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3130; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3131; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3132; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3133; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 3134; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3135; SKIP-CACHE-INV-NEXT: s_endpgm 3136; 3137; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 3138; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3139; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3140; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3141; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3142; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3143; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3144; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3145; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3146; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3147; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3148; 3149; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 3150; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3151; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3152; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3153; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3154; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3155; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3156; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3157; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3158; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3159; GFX90A-TGSPLIT-NEXT: s_endpgm 3160 i32* %out, i32 %in, i32 %old) { 3161entry: 3162 %gep = getelementptr i32, i32* %out, i32 4 3163 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire 3164 %val0 = extractvalue { i32, i1 } %val, 0 3165 store i32 %val0, i32* %out, align 4 3166 ret void 3167} 3168 3169define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( 3170; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 3171; GFX7: ; %bb.0: ; %entry 3172; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 3173; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3174; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3175; GFX7-NEXT: s_add_u32 s4, s0, 16 3176; GFX7-NEXT: s_addc_u32 s5, s1, 0 3177; GFX7-NEXT: v_mov_b32_e32 v0, s4 3178; GFX7-NEXT: v_mov_b32_e32 v2, s2 3179; GFX7-NEXT: v_mov_b32_e32 v1, s5 3180; GFX7-NEXT: v_mov_b32_e32 v3, s3 3181; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3182; GFX7-NEXT: v_mov_b32_e32 v0, s0 3183; GFX7-NEXT: v_mov_b32_e32 v1, s1 3184; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3185; GFX7-NEXT: flat_store_dword v[0:1], v2 3186; GFX7-NEXT: s_endpgm 3187; 3188; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 3189; GFX10-WGP: ; %bb.0: ; %entry 3190; GFX10-WGP-NEXT: s_clause 0x1 3191; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3192; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3193; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3194; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3195; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3196; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3197; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3198; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3199; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3200; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3201; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3202; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3203; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3204; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3205; GFX10-WGP-NEXT: s_endpgm 3206; 3207; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 3208; GFX10-CU: ; %bb.0: ; %entry 3209; GFX10-CU-NEXT: s_clause 0x1 3210; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3211; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3212; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3213; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3214; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3215; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3216; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3217; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3218; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3219; GFX10-CU-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3220; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3221; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3222; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3223; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3224; GFX10-CU-NEXT: s_endpgm 3225; 3226; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 3227; SKIP-CACHE-INV: ; %bb.0: ; %entry 3228; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3229; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3230; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3231; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3232; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3233; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3234; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3235; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3236; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3237; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3238; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3239; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3240; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3241; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3242; SKIP-CACHE-INV-NEXT: s_endpgm 3243; 3244; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 3245; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3246; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3247; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3248; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3249; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3250; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3251; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3252; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3253; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3254; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3255; 3256; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 3257; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3258; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3259; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3260; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3261; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3262; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3263; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3264; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3265; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3266; GFX90A-TGSPLIT-NEXT: s_endpgm 3267 i32* %out, i32 %in, i32 %old) { 3268entry: 3269 %gep = getelementptr i32, i32* %out, i32 4 3270 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire 3271 %val0 = extractvalue { i32, i1 } %val, 0 3272 store i32 %val0, i32* %out, align 4 3273 ret void 3274} 3275 3276define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( 3277; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 3278; GFX7: ; %bb.0: ; %entry 3279; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3280; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3281; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3282; GFX7-NEXT: s_add_u32 s4, s0, 16 3283; GFX7-NEXT: s_addc_u32 s5, s1, 0 3284; GFX7-NEXT: v_mov_b32_e32 v0, s4 3285; GFX7-NEXT: v_mov_b32_e32 v2, s2 3286; GFX7-NEXT: v_mov_b32_e32 v1, s5 3287; GFX7-NEXT: v_mov_b32_e32 v3, s3 3288; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3289; GFX7-NEXT: v_mov_b32_e32 v0, s0 3290; GFX7-NEXT: v_mov_b32_e32 v1, s1 3291; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3292; GFX7-NEXT: flat_store_dword v[0:1], v2 3293; GFX7-NEXT: s_endpgm 3294; 3295; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 3296; GFX10-WGP: ; %bb.0: ; %entry 3297; GFX10-WGP-NEXT: s_clause 0x1 3298; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3299; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3300; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3301; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3302; 
GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3303; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3304; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3305; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3306; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3307; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3308; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3309; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3310; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3311; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3312; GFX10-WGP-NEXT: s_endpgm 3313; 3314; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 3315; GFX10-CU: ; %bb.0: ; %entry 3316; GFX10-CU-NEXT: s_clause 0x1 3317; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3318; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3319; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3320; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3321; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3322; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3323; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3324; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3325; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3326; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3327; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3328; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3329; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3330; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3331; GFX10-CU-NEXT: s_endpgm 3332; 3333; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 3334; SKIP-CACHE-INV: ; %bb.0: ; %entry 3335; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3336; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3337; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3338; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3339; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3340; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3341; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3342; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3343; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3344; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3345; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s2 3346; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3347; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3348; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3349; SKIP-CACHE-INV-NEXT: s_endpgm 3350; 3351; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 3352; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3353; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3354; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3355; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3356; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3357; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3358; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3359; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3360; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3361; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3362; 3363; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 3364; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3365; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3366; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3367; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3368; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3369; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3370; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3371; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3372; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3373; GFX90A-TGSPLIT-NEXT: s_endpgm 3374 i32* %out, i32 %in, i32 %old) { 3375entry: 3376 %gep = getelementptr i32, i32* %out, i32 4 3377 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire 3378 %val0 = extractvalue { i32, i1 } %val, 0 3379 store i32 %val0, i32* %out, align 4 3380 ret void 3381} 3382 3383define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( 3384; GFX7-LABEL: 
flat_singlethread_acq_rel_acquire_ret_cmpxchg: 3385; GFX7: ; %bb.0: ; %entry 3386; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3387; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3388; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3389; GFX7-NEXT: s_add_u32 s4, s0, 16 3390; GFX7-NEXT: s_addc_u32 s5, s1, 0 3391; GFX7-NEXT: v_mov_b32_e32 v0, s4 3392; GFX7-NEXT: v_mov_b32_e32 v2, s2 3393; GFX7-NEXT: v_mov_b32_e32 v1, s5 3394; GFX7-NEXT: v_mov_b32_e32 v3, s3 3395; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3396; GFX7-NEXT: v_mov_b32_e32 v0, s0 3397; GFX7-NEXT: v_mov_b32_e32 v1, s1 3398; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3399; GFX7-NEXT: flat_store_dword v[0:1], v2 3400; GFX7-NEXT: s_endpgm 3401; 3402; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 3403; GFX10-WGP: ; %bb.0: ; %entry 3404; GFX10-WGP-NEXT: s_clause 0x1 3405; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3406; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3407; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3408; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3409; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3410; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3411; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3412; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3413; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3414; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3415; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3416; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3417; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3418; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3419; GFX10-WGP-NEXT: s_endpgm 3420; 3421; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 3422; GFX10-CU: ; %bb.0: ; %entry 3423; GFX10-CU-NEXT: s_clause 0x1 3424; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3425; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3426; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3427; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3428; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3429; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3430; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, s2 3431; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3432; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3433; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3434; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3435; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3436; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3437; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3438; GFX10-CU-NEXT: s_endpgm 3439; 3440; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 3441; SKIP-CACHE-INV: ; %bb.0: ; %entry 3442; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3443; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3444; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3445; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3446; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3447; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3448; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3449; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3450; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3451; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3452; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3453; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3454; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3455; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3456; SKIP-CACHE-INV-NEXT: s_endpgm 3457; 3458; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 3459; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3460; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3461; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3462; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3463; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3464; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3465; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3466; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3467; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3468; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3469; 3470; 
GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 3471; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3472; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3473; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3474; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3475; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3476; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3477; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3478; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3479; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3480; GFX90A-TGSPLIT-NEXT: s_endpgm 3481 i32* %out, i32 %in, i32 %old) { 3482entry: 3483 %gep = getelementptr i32, i32* %out, i32 4 3484 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire 3485 %val0 = extractvalue { i32, i1 } %val, 0 3486 store i32 %val0, i32* %out, align 4 3487 ret void 3488} 3489 3490define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( 3491; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 3492; GFX7: ; %bb.0: ; %entry 3493; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3494; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3495; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3496; GFX7-NEXT: s_add_u32 s4, s0, 16 3497; GFX7-NEXT: s_addc_u32 s5, s1, 0 3498; GFX7-NEXT: v_mov_b32_e32 v0, s4 3499; GFX7-NEXT: v_mov_b32_e32 v2, s2 3500; GFX7-NEXT: v_mov_b32_e32 v1, s5 3501; GFX7-NEXT: v_mov_b32_e32 v3, s3 3502; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3503; GFX7-NEXT: v_mov_b32_e32 v0, s0 3504; GFX7-NEXT: v_mov_b32_e32 v1, s1 3505; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3506; GFX7-NEXT: flat_store_dword v[0:1], v2 3507; GFX7-NEXT: s_endpgm 3508; 3509; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 3510; GFX10-WGP: ; %bb.0: ; %entry 3511; GFX10-WGP-NEXT: s_clause 0x1 3512; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3513; GFX10-WGP-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 3514; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3515; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3516; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3517; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3518; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3519; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3520; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3521; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3522; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3523; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3524; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3525; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3526; GFX10-WGP-NEXT: s_endpgm 3527; 3528; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 3529; GFX10-CU: ; %bb.0: ; %entry 3530; GFX10-CU-NEXT: s_clause 0x1 3531; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3532; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3533; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3534; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3535; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3536; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3537; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3538; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3539; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3540; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3541; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3542; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3543; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3544; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3545; GFX10-CU-NEXT: s_endpgm 3546; 3547; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 3548; SKIP-CACHE-INV: ; %bb.0: ; %entry 3549; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3550; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3551; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3552; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3553; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3554; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3555; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3556; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3557; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s0 3558; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3559; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3560; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3561; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3562; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3563; SKIP-CACHE-INV-NEXT: s_endpgm 3564; 3565; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 3566; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3567; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3568; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3569; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3570; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3571; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3572; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3573; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3574; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3575; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3576; 3577; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 3578; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3579; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3580; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3581; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3582; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3583; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3584; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3585; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3586; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3587; GFX90A-TGSPLIT-NEXT: s_endpgm 3588 i32* %out, i32 %in, i32 %old) { 3589entry: 3590 %gep = getelementptr i32, i32* %out, i32 4 3591 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire 3592 %val0 = extractvalue { i32, i1 } %val, 0 3593 store i32 %val0, i32* %out, align 4 3594 ret void 
3595} 3596 3597define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( 3598; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 3599; GFX7: ; %bb.0: ; %entry 3600; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3601; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3602; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3603; GFX7-NEXT: s_add_u32 s4, s0, 16 3604; GFX7-NEXT: s_addc_u32 s5, s1, 0 3605; GFX7-NEXT: v_mov_b32_e32 v0, s4 3606; GFX7-NEXT: v_mov_b32_e32 v2, s2 3607; GFX7-NEXT: v_mov_b32_e32 v1, s5 3608; GFX7-NEXT: v_mov_b32_e32 v3, s3 3609; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3610; GFX7-NEXT: v_mov_b32_e32 v0, s0 3611; GFX7-NEXT: v_mov_b32_e32 v1, s1 3612; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3613; GFX7-NEXT: flat_store_dword v[0:1], v2 3614; GFX7-NEXT: s_endpgm 3615; 3616; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 3617; GFX10-WGP: ; %bb.0: ; %entry 3618; GFX10-WGP-NEXT: s_clause 0x1 3619; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3620; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3621; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3622; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3623; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3624; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3625; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3626; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3627; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3628; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3629; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3630; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3631; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3632; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3633; GFX10-WGP-NEXT: s_endpgm 3634; 3635; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 3636; GFX10-CU: ; %bb.0: ; %entry 3637; GFX10-CU-NEXT: s_clause 0x1 3638; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3639; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3640; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3641; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 
3642; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3643; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3644; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3645; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3646; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3647; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3648; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3649; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3650; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3651; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3652; GFX10-CU-NEXT: s_endpgm 3653; 3654; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 3655; SKIP-CACHE-INV: ; %bb.0: ; %entry 3656; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3657; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3658; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3659; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3660; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3661; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3662; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3663; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3664; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3665; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3666; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3667; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3668; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3669; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3670; SKIP-CACHE-INV-NEXT: s_endpgm 3671; 3672; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 3673; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3674; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3675; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3676; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3677; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3678; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3679; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3680; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
3681; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3682; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3683; 3684; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 3685; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3686; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3687; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3688; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3689; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3690; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3691; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3692; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3693; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3694; GFX90A-TGSPLIT-NEXT: s_endpgm 3695 i32* %out, i32 %in, i32 %old) { 3696entry: 3697 %gep = getelementptr i32, i32* %out, i32 4 3698 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst 3699 %val0 = extractvalue { i32, i1 } %val, 0 3700 store i32 %val0, i32* %out, align 4 3701 ret void 3702} 3703 3704define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( 3705; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 3706; GFX7: ; %bb.0: ; %entry 3707; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3708; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3709; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3710; GFX7-NEXT: s_add_u32 s4, s0, 16 3711; GFX7-NEXT: s_addc_u32 s5, s1, 0 3712; GFX7-NEXT: v_mov_b32_e32 v0, s4 3713; GFX7-NEXT: v_mov_b32_e32 v2, s2 3714; GFX7-NEXT: v_mov_b32_e32 v1, s5 3715; GFX7-NEXT: v_mov_b32_e32 v3, s3 3716; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3717; GFX7-NEXT: v_mov_b32_e32 v0, s0 3718; GFX7-NEXT: v_mov_b32_e32 v1, s1 3719; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3720; GFX7-NEXT: flat_store_dword v[0:1], v2 3721; GFX7-NEXT: s_endpgm 3722; 3723; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 3724; GFX10-WGP: ; %bb.0: ; %entry 3725; 
GFX10-WGP-NEXT: s_clause 0x1 3726; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3727; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3728; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3729; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3730; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3731; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3732; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3733; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3734; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3735; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3736; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3737; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3738; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3739; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3740; GFX10-WGP-NEXT: s_endpgm 3741; 3742; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 3743; GFX10-CU: ; %bb.0: ; %entry 3744; GFX10-CU-NEXT: s_clause 0x1 3745; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3746; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3747; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3748; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3749; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3750; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3751; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3752; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3753; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3754; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3755; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3756; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3757; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3758; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3759; GFX10-CU-NEXT: s_endpgm 3760; 3761; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 3762; SKIP-CACHE-INV: ; %bb.0: ; %entry 3763; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3764; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3765; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3766; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3767; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3768; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3769; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3770; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3771; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3772; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3773; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3774; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3775; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3776; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3777; SKIP-CACHE-INV-NEXT: s_endpgm 3778; 3779; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 3780; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3781; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3782; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3783; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3784; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3785; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3786; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3787; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3788; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3789; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3790; 3791; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 3792; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3793; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3794; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3795; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3796; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3797; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3798; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3799; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3800; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3801; GFX90A-TGSPLIT-NEXT: s_endpgm 3802 i32* %out, i32 %in, i32 %old) { 3803entry: 3804 %gep = getelementptr i32, i32* %out, i32 4 3805 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") 
acquire seq_cst 3806 %val0 = extractvalue { i32, i1 } %val, 0 3807 store i32 %val0, i32* %out, align 4 3808 ret void 3809} 3810
; Covers a volatile cmpxchg on %out[4] with syncscope singlethread, success ordering release and failure ordering seq_cst; the old value it yields is stored to %out.
3811define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( 3812; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 3813; GFX7: ; %bb.0: ; %entry 3814; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3815; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3816; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3817; GFX7-NEXT: s_add_u32 s4, s0, 16 3818; GFX7-NEXT: s_addc_u32 s5, s1, 0 3819; GFX7-NEXT: v_mov_b32_e32 v0, s4 3820; GFX7-NEXT: v_mov_b32_e32 v2, s2 3821; GFX7-NEXT: v_mov_b32_e32 v1, s5 3822; GFX7-NEXT: v_mov_b32_e32 v3, s3 3823; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3824; GFX7-NEXT: v_mov_b32_e32 v0, s0 3825; GFX7-NEXT: v_mov_b32_e32 v1, s1 3826; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3827; GFX7-NEXT: flat_store_dword v[0:1], v2 3828; GFX7-NEXT: s_endpgm 3829; 3830; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 3831; GFX10-WGP: ; %bb.0: ; %entry 3832; GFX10-WGP-NEXT: s_clause 0x1 3833; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3834; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3835; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3836; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3837; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3838; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3839; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3840; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3841; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3842; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3843; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3844; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3845; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3846; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3847; GFX10-WGP-NEXT: s_endpgm 3848; 3849; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 3850; GFX10-CU: ; %bb.0: ; %entry 3851; GFX10-CU-NEXT: s_clause 0x1 3852; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3853; GFX10-CU-NEXT:
s_load_dwordx2 s[2:3], s[4:5], 0x8 3854; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3855; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3856; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3857; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3858; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3859; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3860; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3861; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3862; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3863; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3864; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3865; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3866; GFX10-CU-NEXT: s_endpgm 3867; 3868; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 3869; SKIP-CACHE-INV: ; %bb.0: ; %entry 3870; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3871; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3872; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3873; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 3874; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3875; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3876; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3877; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3878; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3879; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3880; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3881; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3882; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3883; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3884; SKIP-CACHE-INV-NEXT: s_endpgm 3885; 3886; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 3887; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3888; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3889; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3890; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3891; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3892; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3893; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3894; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3895; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3896; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3897; 3898; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 3899; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3900; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3901; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3902; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3903; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3904; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3905; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3906; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3907; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3908; GFX90A-TGSPLIT-NEXT: s_endpgm 3909 i32* %out, i32 %in, i32 %old) { 3910entry: 3911 %gep = getelementptr i32, i32* %out, i32 4 3912 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst 3913 %val0 = extractvalue { i32, i1 } %val, 0 3914 store i32 %val0, i32* %out, align 4 3915 ret void 3916} 3917 ; Covers a volatile cmpxchg on %out[4] with syncscope singlethread, success ordering acq_rel and failure ordering seq_cst; the old value it yields is stored to %out. 3918define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( 3919; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 3920; GFX7: ; %bb.0: ; %entry 3921; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3922; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3923; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3924; GFX7-NEXT: s_add_u32 s4, s0, 16 3925; GFX7-NEXT: s_addc_u32 s5, s1, 0 3926; GFX7-NEXT: v_mov_b32_e32 v0, s4 3927; GFX7-NEXT: v_mov_b32_e32 v2, s2 3928; GFX7-NEXT: v_mov_b32_e32 v1, s5 3929; GFX7-NEXT: v_mov_b32_e32 v3, s3 3930; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3931; GFX7-NEXT: v_mov_b32_e32 v0, s0 3932; GFX7-NEXT: v_mov_b32_e32 v1, s1 3933; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3934; GFX7-NEXT: flat_store_dword v[0:1], v2 3935; GFX7-NEXT: s_endpgm 3936; 3937;
GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 3938; GFX10-WGP: ; %bb.0: ; %entry 3939; GFX10-WGP-NEXT: s_clause 0x1 3940; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3941; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3942; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3943; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 3944; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 3945; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3946; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3947; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3948; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3949; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3950; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3951; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3952; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3953; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3954; GFX10-WGP-NEXT: s_endpgm 3955; 3956; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 3957; GFX10-CU: ; %bb.0: ; %entry 3958; GFX10-CU-NEXT: s_clause 0x1 3959; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3960; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3961; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3962; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 3963; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 3964; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3965; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3966; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3967; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3968; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3969; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3970; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3971; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3972; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3973; GFX10-CU-NEXT: s_endpgm 3974; 3975; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 3976; SKIP-CACHE-INV: ; %bb.0: ; %entry 3977; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3978; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3979; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3980; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 
16 3981; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 3982; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3983; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3984; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 3985; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3986; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 3987; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3988; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3989; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3990; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3991; SKIP-CACHE-INV-NEXT: s_endpgm 3992; 3993; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 3994; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3995; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3996; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3997; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3998; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3999; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4000; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4001; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4002; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4003; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4004; 4005; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 4006; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4007; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4008; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4009; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4010; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4011; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4012; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4013; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4014; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4015; GFX90A-TGSPLIT-NEXT: s_endpgm 4016 i32* %out, i32 %in, i32 %old) { 4017entry: 4018 %gep = getelementptr 
i32, i32* %out, i32 4 4019 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst 4020 %val0 = extractvalue { i32, i1 } %val, 0 4021 store i32 %val0, i32* %out, align 4 4022 ret void 4023} 4024
; Covers a volatile cmpxchg on %out[4] with syncscope singlethread and seq_cst for both the success and failure orderings; the old value it yields is stored to %out.
4025define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( 4026; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 4027; GFX7: ; %bb.0: ; %entry 4028; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4029; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4030; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4031; GFX7-NEXT: s_add_u32 s4, s0, 16 4032; GFX7-NEXT: s_addc_u32 s5, s1, 0 4033; GFX7-NEXT: v_mov_b32_e32 v0, s4 4034; GFX7-NEXT: v_mov_b32_e32 v2, s2 4035; GFX7-NEXT: v_mov_b32_e32 v1, s5 4036; GFX7-NEXT: v_mov_b32_e32 v3, s3 4037; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4038; GFX7-NEXT: v_mov_b32_e32 v0, s0 4039; GFX7-NEXT: v_mov_b32_e32 v1, s1 4040; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4041; GFX7-NEXT: flat_store_dword v[0:1], v2 4042; GFX7-NEXT: s_endpgm 4043; 4044; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 4045; GFX10-WGP: ; %bb.0: ; %entry 4046; GFX10-WGP-NEXT: s_clause 0x1 4047; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4048; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4049; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4050; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4051; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4052; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4053; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4054; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4055; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4056; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4057; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4058; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4059; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4060; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4061; GFX10-WGP-NEXT: s_endpgm 4062; 4063; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 4064; GFX10-CU: ; %bb.0: ; %entry 4065;
GFX10-CU-NEXT: s_clause 0x1 4066; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4067; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4068; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4069; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4070; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4071; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4072; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4073; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4074; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4075; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4076; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4077; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4078; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4079; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4080; GFX10-CU-NEXT: s_endpgm 4081; 4082; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 4083; SKIP-CACHE-INV: ; %bb.0: ; %entry 4084; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4085; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4086; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4087; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4088; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4089; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4090; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4091; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4092; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4093; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4094; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4095; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4096; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4097; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4098; SKIP-CACHE-INV-NEXT: s_endpgm 4099; 4100; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 4101; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4102; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4103; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4104; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4105; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4106; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4107; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4108; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4109; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4110; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4111; 4112; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 4113; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4114; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4115; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4116; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4117; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4118; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4119; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 4120; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4121; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4122; GFX90A-TGSPLIT-NEXT: s_endpgm 4123 i32* %out, i32 %in, i32 %old) { 4124entry: 4125 %gep = getelementptr i32, i32* %out, i32 4 4126 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst 4127 %val0 = extractvalue { i32, i1 } %val, 0 4128 store i32 %val0, i32* %out, align 4 4129 ret void 4130} 4131 ; Covers an atomic i32 load from %in with syncscope singlethread-one-as and unordered ordering; the loaded value is then written to %out by a non-atomic store. 4132define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( 4133; GFX7-LABEL: flat_singlethread_one_as_unordered_load: 4134; GFX7: ; %bb.0: ; %entry 4135; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4136; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4137; GFX7-NEXT: v_mov_b32_e32 v0, s0 4138; GFX7-NEXT: v_mov_b32_e32 v1, s1 4139; GFX7-NEXT: flat_load_dword v0, v[0:1] 4140; GFX7-NEXT: v_mov_b32_e32 v2, s2 4141; GFX7-NEXT: v_mov_b32_e32 v3, s3 4142; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4143; GFX7-NEXT: flat_store_dword v[2:3], v0 4144; GFX7-NEXT: s_endpgm 4145; 4146; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: 4147; GFX10-WGP: ; %bb.0: ; %entry 4148; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3],
s[4:5], 0x0 4149; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4150; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4151; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4152; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 4153; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 4154; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 4155; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4156; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4157; GFX10-WGP-NEXT: s_endpgm 4158; 4159; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: 4160; GFX10-CU: ; %bb.0: ; %entry 4161; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4162; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4163; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4164; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4165; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 4166; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 4167; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 4168; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4169; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4170; GFX10-CU-NEXT: s_endpgm 4171; 4172; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load: 4173; SKIP-CACHE-INV: ; %bb.0: ; %entry 4174; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4175; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4176; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4177; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4178; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 4179; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4180; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 4181; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4182; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 4183; SKIP-CACHE-INV-NEXT: s_endpgm 4184; 4185; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 4186; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4187; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4188; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4189; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4190; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4191; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4192; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4193; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4194; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4195; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 4196; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4197; 4198; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 4199; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4200; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4201; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4202; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4203; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4204; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4205; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4206; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4207; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4208; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 4209; GFX90A-TGSPLIT-NEXT: s_endpgm 4210 i32* %in, i32* %out) { 4211entry: 4212 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 4213 store i32 %val, i32* %out 4214 ret void 4215} 4216 ; Covers an atomic i32 load from %in with syncscope singlethread-one-as and monotonic ordering; the loaded value is then written to %out by a non-atomic store. 4217define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( 4218; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: 4219; GFX7: ; %bb.0: ; %entry 4220; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4221; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4222; GFX7-NEXT: v_mov_b32_e32 v0, s0 4223; GFX7-NEXT: v_mov_b32_e32 v1, s1 4224; GFX7-NEXT: flat_load_dword v0, v[0:1] 4225; GFX7-NEXT: v_mov_b32_e32 v2, s2 4226; GFX7-NEXT: v_mov_b32_e32 v3, s3 4227; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4228; GFX7-NEXT: flat_store_dword v[2:3], v0 4229; GFX7-NEXT: s_endpgm 4230; 4231; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: 4232; GFX10-WGP: ; %bb.0: ; %entry 4233; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4234; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4235; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4236; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4237; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 4238; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 4239;
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 4240; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4241; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4242; GFX10-WGP-NEXT: s_endpgm 4243; 4244; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: 4245; GFX10-CU: ; %bb.0: ; %entry 4246; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4247; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4248; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4249; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4250; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 4251; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 4252; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 4253; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4254; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4255; GFX10-CU-NEXT: s_endpgm 4256; 4257; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load: 4258; SKIP-CACHE-INV: ; %bb.0: ; %entry 4259; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4260; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4261; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4262; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4263; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 4264; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4265; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 4266; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4267; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 4268; SKIP-CACHE-INV-NEXT: s_endpgm 4269; 4270; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 4271; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4272; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4273; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4274; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4275; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4276; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4277; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4278; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4279; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4280; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 4281; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4282; 
4283; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 4284; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4285; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4286; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4287; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4288; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4289; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4290; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4291; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4292; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4293; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 4294; GFX90A-TGSPLIT-NEXT: s_endpgm 4295 i32* %in, i32* %out) { 4296entry: 4297 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 4298 store i32 %val, i32* %out 4299 ret void 4300} 4301 ; Covers an atomic i32 load from %in with syncscope singlethread-one-as and acquire ordering; the loaded value is then written to %out by a non-atomic store. 4302define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( 4303; GFX7-LABEL: flat_singlethread_one_as_acquire_load: 4304; GFX7: ; %bb.0: ; %entry 4305; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4306; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4307; GFX7-NEXT: v_mov_b32_e32 v0, s0 4308; GFX7-NEXT: v_mov_b32_e32 v1, s1 4309; GFX7-NEXT: flat_load_dword v0, v[0:1] 4310; GFX7-NEXT: v_mov_b32_e32 v2, s2 4311; GFX7-NEXT: v_mov_b32_e32 v3, s3 4312; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4313; GFX7-NEXT: flat_store_dword v[2:3], v0 4314; GFX7-NEXT: s_endpgm 4315; 4316; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: 4317; GFX10-WGP: ; %bb.0: ; %entry 4318; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4319; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4320; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4321; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4322; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 4323; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 4324; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 4325; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4326; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4327; GFX10-WGP-NEXT: s_endpgm 4328; 4329; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: 4330; GFX10-CU: ; %bb.0: ;
%entry 4331; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4332; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4333; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4334; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4335; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 4336; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 4337; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 4338; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4339; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4340; GFX10-CU-NEXT: s_endpgm 4341; 4342; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load: 4343; SKIP-CACHE-INV: ; %bb.0: ; %entry 4344; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4345; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4346; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4347; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4348; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 4349; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4350; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 4351; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4352; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 4353; SKIP-CACHE-INV-NEXT: s_endpgm 4354; 4355; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 4356; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4357; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4358; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4359; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4360; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4361; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4362; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4363; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4364; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4365; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 4366; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4367; 4368; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 4369; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4370; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4371; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4372; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4373; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4374; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4375; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4376; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4377; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4378; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 4379; GFX90A-TGSPLIT-NEXT: s_endpgm 4380 i32* %in, i32* %out) { 4381entry: 4382 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 4383 store i32 %val, i32* %out 4384 ret void 4385} 4386 ; Covers an atomic i32 load from %in with syncscope singlethread-one-as and seq_cst ordering; the loaded value is then written to %out by a non-atomic store. 4387define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( 4388; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: 4389; GFX7: ; %bb.0: ; %entry 4390; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4391; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4392; GFX7-NEXT: v_mov_b32_e32 v0, s0 4393; GFX7-NEXT: v_mov_b32_e32 v1, s1 4394; GFX7-NEXT: flat_load_dword v0, v[0:1] 4395; GFX7-NEXT: v_mov_b32_e32 v2, s2 4396; GFX7-NEXT: v_mov_b32_e32 v3, s3 4397; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4398; GFX7-NEXT: flat_store_dword v[2:3], v0 4399; GFX7-NEXT: s_endpgm 4400; 4401; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: 4402; GFX10-WGP: ; %bb.0: ; %entry 4403; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4404; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4405; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4406; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4407; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 4408; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 4409; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 4410; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4411; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4412; GFX10-WGP-NEXT: s_endpgm 4413; 4414; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: 4415; GFX10-CU: ; %bb.0: ; %entry 4416; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4417; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4418; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4419; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4420; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 4421; GFX10-CU-NEXT: v_mov_b32_e32 v0,
s2 4422; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 4423; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4424; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4425; GFX10-CU-NEXT: s_endpgm 4426; 4427; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load: 4428; SKIP-CACHE-INV: ; %bb.0: ; %entry 4429; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4430; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4431; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4432; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4433; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 4434; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4435; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 4436; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4437; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 4438; SKIP-CACHE-INV-NEXT: s_endpgm 4439; 4440; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 4441; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4442; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4443; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4444; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4445; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4446; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4447; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4448; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4449; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4450; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 4451; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4452; 4453; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 4454; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4455; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4456; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4457; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4458; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4459; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 4460; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4461; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 4462; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4463; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[2:3], v0 4464; GFX90A-TGSPLIT-NEXT: s_endpgm 4465 i32* %in, i32* %out) { 4466entry: 4467 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 4468 store i32 %val, i32* %out 4469 ret void 4470} 4471 4472define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( 4473; GFX7-LABEL: flat_singlethread_one_as_unordered_store: 4474; GFX7: ; %bb.0: ; %entry 4475; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 4476; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 4477; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4478; GFX7-NEXT: v_mov_b32_e32 v2, s2 4479; GFX7-NEXT: v_mov_b32_e32 v0, s0 4480; GFX7-NEXT: v_mov_b32_e32 v1, s1 4481; GFX7-NEXT: flat_store_dword v[0:1], v2 4482; GFX7-NEXT: s_endpgm 4483; 4484; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: 4485; GFX10-WGP: ; %bb.0: ; %entry 4486; GFX10-WGP-NEXT: s_clause 0x1 4487; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4488; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 4489; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4490; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4491; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4492; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4493; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4494; GFX10-WGP-NEXT: s_endpgm 4495; 4496; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: 4497; GFX10-CU: ; %bb.0: ; %entry 4498; GFX10-CU-NEXT: s_clause 0x1 4499; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4500; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 4501; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4502; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4503; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4504; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4505; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4506; GFX10-CU-NEXT: s_endpgm 4507; 4508; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store: 4509; SKIP-CACHE-INV: ; %bb.0: ; %entry 4510; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 4511; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4512; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) 4513; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4514; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4515; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4516; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4517; SKIP-CACHE-INV-NEXT: s_endpgm 4518; 4519; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 4520; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4521; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 4522; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4523; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4524; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4525; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4526; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4527; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4528; 4529; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 4530; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4531; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 4532; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4533; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4534; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4535; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4536; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4537; GFX90A-TGSPLIT-NEXT: s_endpgm 4538 i32 %in, i32* %out) { 4539entry: 4540 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 4541 ret void 4542} 4543 4544define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( 4545; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: 4546; GFX7: ; %bb.0: ; %entry 4547; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 4548; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 4549; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4550; GFX7-NEXT: v_mov_b32_e32 v2, s2 4551; GFX7-NEXT: v_mov_b32_e32 v0, s0 4552; GFX7-NEXT: v_mov_b32_e32 v1, s1 4553; GFX7-NEXT: flat_store_dword v[0:1], v2 4554; GFX7-NEXT: s_endpgm 4555; 4556; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: 4557; GFX10-WGP: ; %bb.0: 
; %entry 4558; GFX10-WGP-NEXT: s_clause 0x1 4559; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4560; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 4561; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4562; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4563; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4564; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4565; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4566; GFX10-WGP-NEXT: s_endpgm 4567; 4568; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: 4569; GFX10-CU: ; %bb.0: ; %entry 4570; GFX10-CU-NEXT: s_clause 0x1 4571; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4572; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 4573; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4574; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4575; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4576; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4577; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4578; GFX10-CU-NEXT: s_endpgm 4579; 4580; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store: 4581; SKIP-CACHE-INV: ; %bb.0: ; %entry 4582; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 4583; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4584; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4585; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4586; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4587; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4588; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4589; SKIP-CACHE-INV-NEXT: s_endpgm 4590; 4591; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 4592; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4593; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 4594; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4595; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4596; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4597; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4598; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4599; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4600; 4601; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_one_as_monotonic_store: 4602; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4603; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 4604; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4605; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4606; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4607; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4608; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4609; GFX90A-TGSPLIT-NEXT: s_endpgm 4610 i32 %in, i32* %out) { 4611entry: 4612 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 4613 ret void 4614} 4615 4616define amdgpu_kernel void @flat_singlethread_one_as_release_store( 4617; GFX7-LABEL: flat_singlethread_one_as_release_store: 4618; GFX7: ; %bb.0: ; %entry 4619; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 4620; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 4621; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4622; GFX7-NEXT: v_mov_b32_e32 v2, s2 4623; GFX7-NEXT: v_mov_b32_e32 v0, s0 4624; GFX7-NEXT: v_mov_b32_e32 v1, s1 4625; GFX7-NEXT: flat_store_dword v[0:1], v2 4626; GFX7-NEXT: s_endpgm 4627; 4628; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: 4629; GFX10-WGP: ; %bb.0: ; %entry 4630; GFX10-WGP-NEXT: s_clause 0x1 4631; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4632; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 4633; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4634; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4635; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4636; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4637; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4638; GFX10-WGP-NEXT: s_endpgm 4639; 4640; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: 4641; GFX10-CU: ; %bb.0: ; %entry 4642; GFX10-CU-NEXT: s_clause 0x1 4643; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4644; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 4645; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4646; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4647; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4648; GFX10-CU-NEXT: v_mov_b32_e32 
v2, s2 4649; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4650; GFX10-CU-NEXT: s_endpgm 4651; 4652; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store: 4653; SKIP-CACHE-INV: ; %bb.0: ; %entry 4654; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 4655; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4656; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4657; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4658; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4659; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4660; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4661; SKIP-CACHE-INV-NEXT: s_endpgm 4662; 4663; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: 4664; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4665; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 4666; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4667; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4668; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4669; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4670; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4671; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4672; 4673; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: 4674; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4675; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 4676; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4677; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4678; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4679; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4680; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4681; GFX90A-TGSPLIT-NEXT: s_endpgm 4682 i32 %in, i32* %out) { 4683entry: 4684 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 4685 ret void 4686} 4687 4688define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( 4689; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: 4690; GFX7: ; %bb.0: ; %entry 4691; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 4692; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x2 4693; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4694; GFX7-NEXT: v_mov_b32_e32 v2, s2 4695; GFX7-NEXT: v_mov_b32_e32 v0, s0 4696; GFX7-NEXT: v_mov_b32_e32 v1, s1 4697; GFX7-NEXT: flat_store_dword v[0:1], v2 4698; GFX7-NEXT: s_endpgm 4699; 4700; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: 4701; GFX10-WGP: ; %bb.0: ; %entry 4702; GFX10-WGP-NEXT: s_clause 0x1 4703; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4704; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 4705; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4706; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4707; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4708; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4709; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4710; GFX10-WGP-NEXT: s_endpgm 4711; 4712; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: 4713; GFX10-CU: ; %bb.0: ; %entry 4714; GFX10-CU-NEXT: s_clause 0x1 4715; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4716; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 4717; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4718; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4719; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4720; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4721; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4722; GFX10-CU-NEXT: s_endpgm 4723; 4724; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store: 4725; SKIP-CACHE-INV: ; %bb.0: ; %entry 4726; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 4727; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4728; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4729; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 4730; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4731; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4732; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4733; SKIP-CACHE-INV-NEXT: s_endpgm 4734; 4735; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 4736; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4737; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 4738; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x8 4739; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4740; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4741; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4742; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4743; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4744; 4745; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 4746; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4747; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 4748; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4749; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4750; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4751; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4752; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4753; GFX90A-TGSPLIT-NEXT: s_endpgm 4754 i32 %in, i32* %out) { 4755entry: 4756 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 4757 ret void 4758} 4759 4760define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( 4761; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 4762; GFX7: ; %bb.0: ; %entry 4763; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4764; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 4765; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4766; GFX7-NEXT: v_mov_b32_e32 v0, s0 4767; GFX7-NEXT: v_mov_b32_e32 v1, s1 4768; GFX7-NEXT: v_mov_b32_e32 v2, s2 4769; GFX7-NEXT: flat_atomic_swap v[0:1], v2 4770; GFX7-NEXT: s_endpgm 4771; 4772; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 4773; GFX10-WGP: ; %bb.0: ; %entry 4774; GFX10-WGP-NEXT: s_clause 0x1 4775; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4776; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 4777; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4778; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4779; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4780; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4781; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 4782; GFX10-WGP-NEXT: s_endpgm 4783; 4784; GFX10-CU-LABEL: 
flat_singlethread_one_as_monotonic_atomicrmw: 4785; GFX10-CU: ; %bb.0: ; %entry 4786; GFX10-CU-NEXT: s_clause 0x1 4787; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4788; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 4789; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4790; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4791; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4792; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4793; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 4794; GFX10-CU-NEXT: s_endpgm 4795; 4796; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 4797; SKIP-CACHE-INV: ; %bb.0: ; %entry 4798; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4799; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4800; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4801; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4802; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4803; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4804; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 4805; SKIP-CACHE-INV-NEXT: s_endpgm 4806; 4807; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 4808; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4809; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4810; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4811; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4812; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4813; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4814; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 4815; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4816; 4817; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 4818; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4819; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4820; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4821; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4822; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4823; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4824; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 4825; 
GFX90A-TGSPLIT-NEXT: s_endpgm 4826 i32* %out, i32 %in) { 4827entry: 4828 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic 4829 ret void 4830} 4831 4832define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( 4833; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 4834; GFX7: ; %bb.0: ; %entry 4835; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4836; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 4837; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4838; GFX7-NEXT: v_mov_b32_e32 v0, s0 4839; GFX7-NEXT: v_mov_b32_e32 v1, s1 4840; GFX7-NEXT: v_mov_b32_e32 v2, s2 4841; GFX7-NEXT: flat_atomic_swap v[0:1], v2 4842; GFX7-NEXT: s_endpgm 4843; 4844; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 4845; GFX10-WGP: ; %bb.0: ; %entry 4846; GFX10-WGP-NEXT: s_clause 0x1 4847; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4848; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 4849; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4850; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4851; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4852; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4853; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 4854; GFX10-WGP-NEXT: s_endpgm 4855; 4856; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 4857; GFX10-CU: ; %bb.0: ; %entry 4858; GFX10-CU-NEXT: s_clause 0x1 4859; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4860; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 4861; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4862; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4863; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4864; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4865; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 4866; GFX10-CU-NEXT: s_endpgm 4867; 4868; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 4869; SKIP-CACHE-INV: ; %bb.0: ; %entry 4870; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4871; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4872; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4873; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s2 4874; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4875; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4876; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 4877; SKIP-CACHE-INV-NEXT: s_endpgm 4878; 4879; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 4880; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4881; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4882; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4883; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4884; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4885; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4886; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 4887; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4888; 4889; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 4890; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4891; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4892; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4893; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4894; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4895; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4896; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 4897; GFX90A-TGSPLIT-NEXT: s_endpgm 4898 i32* %out, i32 %in) { 4899entry: 4900 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire 4901 ret void 4902} 4903 4904define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( 4905; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: 4906; GFX7: ; %bb.0: ; %entry 4907; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4908; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 4909; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4910; GFX7-NEXT: v_mov_b32_e32 v0, s0 4911; GFX7-NEXT: v_mov_b32_e32 v1, s1 4912; GFX7-NEXT: v_mov_b32_e32 v2, s2 4913; GFX7-NEXT: flat_atomic_swap v[0:1], v2 4914; GFX7-NEXT: s_endpgm 4915; 4916; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: 4917; GFX10-WGP: ; %bb.0: ; %entry 4918; 
GFX10-WGP-NEXT: s_clause 0x1 4919; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4920; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 4921; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4922; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4923; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4924; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4925; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 4926; GFX10-WGP-NEXT: s_endpgm 4927; 4928; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: 4929; GFX10-CU: ; %bb.0: ; %entry 4930; GFX10-CU-NEXT: s_clause 0x1 4931; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4932; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 4933; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4934; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4935; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4936; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4937; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 4938; GFX10-CU-NEXT: s_endpgm 4939; 4940; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw: 4941; SKIP-CACHE-INV: ; %bb.0: ; %entry 4942; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4943; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4944; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4945; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4946; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4947; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4948; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 4949; SKIP-CACHE-INV-NEXT: s_endpgm 4950; 4951; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 4952; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4953; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4954; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4955; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4956; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4957; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4958; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 4959; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4960; 4961; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_one_as_release_atomicrmw: 4962; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4963; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4964; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4965; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4966; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4967; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4968; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 4969; GFX90A-TGSPLIT-NEXT: s_endpgm 4970 i32* %out, i32 %in) { 4971entry: 4972 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release 4973 ret void 4974} 4975 4976define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( 4977; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 4978; GFX7: ; %bb.0: ; %entry 4979; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4980; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 4981; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4982; GFX7-NEXT: v_mov_b32_e32 v0, s0 4983; GFX7-NEXT: v_mov_b32_e32 v1, s1 4984; GFX7-NEXT: v_mov_b32_e32 v2, s2 4985; GFX7-NEXT: flat_atomic_swap v[0:1], v2 4986; GFX7-NEXT: s_endpgm 4987; 4988; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 4989; GFX10-WGP: ; %bb.0: ; %entry 4990; GFX10-WGP-NEXT: s_clause 0x1 4991; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4992; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 4993; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4994; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4995; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4996; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4997; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 4998; GFX10-WGP-NEXT: s_endpgm 4999; 5000; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 5001; GFX10-CU: ; %bb.0: ; %entry 5002; GFX10-CU-NEXT: s_clause 0x1 5003; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5004; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5005; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5006; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5007; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5008; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5009; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 5010; GFX10-CU-NEXT: s_endpgm 5011; 5012; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 5013; SKIP-CACHE-INV: ; %bb.0: ; %entry 5014; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5015; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5016; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5017; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5018; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5019; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5020; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 5021; SKIP-CACHE-INV-NEXT: s_endpgm 5022; 5023; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 5024; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5025; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5026; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5027; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5028; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5029; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5030; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5031; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5032; 5033; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 5034; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5035; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5036; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5037; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5038; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5039; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5040; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5041; GFX90A-TGSPLIT-NEXT: s_endpgm 5042 i32* %out, i32 %in) { 5043entry: 5044 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel 5045 ret void 5046} 5047 5048define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( 5049; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 5050; GFX7: ; %bb.0: ; %entry 5051; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5052; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5053; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5054; GFX7-NEXT: v_mov_b32_e32 v0, s0 5055; GFX7-NEXT: v_mov_b32_e32 v1, s1 5056; GFX7-NEXT: v_mov_b32_e32 v2, s2 5057; GFX7-NEXT: flat_atomic_swap v[0:1], v2 5058; GFX7-NEXT: s_endpgm 5059; 5060; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 5061; GFX10-WGP: ; %bb.0: ; %entry 5062; GFX10-WGP-NEXT: s_clause 0x1 5063; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5064; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5065; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5066; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5067; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5068; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5069; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 5070; GFX10-WGP-NEXT: s_endpgm 5071; 5072; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 5073; GFX10-CU: ; %bb.0: ; %entry 5074; GFX10-CU-NEXT: s_clause 0x1 5075; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5076; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5077; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5078; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5079; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5080; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5081; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 5082; GFX10-CU-NEXT: s_endpgm 5083; 5084; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 5085; SKIP-CACHE-INV: ; %bb.0: ; %entry 5086; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5087; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5088; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5089; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5090; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5091; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5092; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 5093; SKIP-CACHE-INV-NEXT: s_endpgm 5094; 5095; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 5096; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5097; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 5098; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5099; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5100; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5101; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5102; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5103; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5104; 5105; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 5106; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5107; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5108; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5109; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5110; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5111; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5112; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 5113; GFX90A-TGSPLIT-NEXT: s_endpgm 5114 i32* %out, i32 %in) { 5115entry: 5116 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst 5117 ret void 5118} 5119 5120define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( 5121; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 5122; GFX7: ; %bb.0: ; %entry 5123; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5124; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5125; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5126; GFX7-NEXT: v_mov_b32_e32 v0, s0 5127; GFX7-NEXT: v_mov_b32_e32 v1, s1 5128; GFX7-NEXT: v_mov_b32_e32 v2, s2 5129; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5130; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5131; GFX7-NEXT: flat_store_dword v[0:1], v2 5132; GFX7-NEXT: s_endpgm 5133; 5134; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 5135; GFX10-WGP: ; %bb.0: ; %entry 5136; GFX10-WGP-NEXT: s_clause 0x1 5137; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5138; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5139; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5140; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5141; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 
5142; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5143; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5144; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5145; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5146; GFX10-WGP-NEXT: s_endpgm 5147; 5148; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 5149; GFX10-CU: ; %bb.0: ; %entry 5150; GFX10-CU-NEXT: s_clause 0x1 5151; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5152; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5153; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5154; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5155; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5156; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5157; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5158; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5159; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5160; GFX10-CU-NEXT: s_endpgm 5161; 5162; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 5163; SKIP-CACHE-INV: ; %bb.0: ; %entry 5164; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5165; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5166; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5167; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5168; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5169; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5170; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5171; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5172; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5173; SKIP-CACHE-INV-NEXT: s_endpgm 5174; 5175; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 5176; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5177; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5178; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5179; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5180; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5181; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5182; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5183; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5184; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5185; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5186; 5187; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 5188; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5189; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5190; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5191; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5192; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5193; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5194; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5195; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5196; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5197; GFX90A-TGSPLIT-NEXT: s_endpgm 5198 i32* %out, i32 %in) { 5199entry: 5200 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire 5201 store i32 %val, i32* %out, align 4 5202 ret void 5203} 5204 5205define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( 5206; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 5207; GFX7: ; %bb.0: ; %entry 5208; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5209; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5210; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5211; GFX7-NEXT: v_mov_b32_e32 v0, s0 5212; GFX7-NEXT: v_mov_b32_e32 v1, s1 5213; GFX7-NEXT: v_mov_b32_e32 v2, s2 5214; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5215; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5216; GFX7-NEXT: flat_store_dword v[0:1], v2 5217; GFX7-NEXT: s_endpgm 5218; 5219; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 5220; GFX10-WGP: ; %bb.0: ; %entry 5221; GFX10-WGP-NEXT: s_clause 0x1 5222; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5223; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5224; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5225; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5226; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5227; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5228; 
GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5229; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5230; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5231; GFX10-WGP-NEXT: s_endpgm 5232; 5233; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 5234; GFX10-CU: ; %bb.0: ; %entry 5235; GFX10-CU-NEXT: s_clause 0x1 5236; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5237; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5238; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5239; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5240; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5241; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5242; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5243; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5244; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5245; GFX10-CU-NEXT: s_endpgm 5246; 5247; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 5248; SKIP-CACHE-INV: ; %bb.0: ; %entry 5249; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5250; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5251; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5252; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5253; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5254; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5255; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5256; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5257; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5258; SKIP-CACHE-INV-NEXT: s_endpgm 5259; 5260; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 5261; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5262; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5263; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5264; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5265; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5266; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5267; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5268; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5269; 
GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5270; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5271; 5272; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 5273; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5274; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5275; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5276; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5277; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5278; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5279; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5280; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5281; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5282; GFX90A-TGSPLIT-NEXT: s_endpgm 5283 i32* %out, i32 %in) { 5284entry: 5285 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel 5286 store i32 %val, i32* %out, align 4 5287 ret void 5288} 5289 5290define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( 5291; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 5292; GFX7: ; %bb.0: ; %entry 5293; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5294; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 5295; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5296; GFX7-NEXT: v_mov_b32_e32 v0, s0 5297; GFX7-NEXT: v_mov_b32_e32 v1, s1 5298; GFX7-NEXT: v_mov_b32_e32 v2, s2 5299; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5300; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5301; GFX7-NEXT: flat_store_dword v[0:1], v2 5302; GFX7-NEXT: s_endpgm 5303; 5304; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 5305; GFX10-WGP: ; %bb.0: ; %entry 5306; GFX10-WGP-NEXT: s_clause 0x1 5307; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5308; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 5309; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5310; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5311; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5312; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5313; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5314; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5315; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5316; GFX10-WGP-NEXT: s_endpgm 5317; 5318; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 5319; GFX10-CU: ; %bb.0: ; %entry 5320; GFX10-CU-NEXT: s_clause 0x1 5321; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5322; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 5323; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5324; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5325; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5326; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5327; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5328; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5329; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5330; GFX10-CU-NEXT: s_endpgm 5331; 5332; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 5333; SKIP-CACHE-INV: ; %bb.0: ; %entry 5334; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5335; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5336; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5337; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5338; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5339; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5340; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5341; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5342; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5343; SKIP-CACHE-INV-NEXT: s_endpgm 5344; 5345; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 5346; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5347; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5348; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5349; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5350; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5351; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5352; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5353; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5354; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5355; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5356; 5357; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 5358; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5359; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5360; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 5361; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5362; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5363; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 5364; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 5365; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5366; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5367; GFX90A-TGSPLIT-NEXT: s_endpgm 5368 i32* %out, i32 %in) { 5369entry: 5370 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst 5371 store i32 %val, i32* %out, align 4 5372 ret void 5373} 5374 5375define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( 5376; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 5377; GFX7: ; %bb.0: ; %entry 5378; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5379; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5380; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5381; GFX7-NEXT: s_add_u32 s0, s0, 16 5382; GFX7-NEXT: s_addc_u32 s1, s1, 0 5383; GFX7-NEXT: v_mov_b32_e32 v0, s0 5384; GFX7-NEXT: v_mov_b32_e32 v2, s2 5385; GFX7-NEXT: v_mov_b32_e32 v1, s1 5386; GFX7-NEXT: v_mov_b32_e32 v3, s3 5387; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5388; GFX7-NEXT: s_endpgm 5389; 5390; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 5391; GFX10-WGP: ; %bb.0: ; %entry 5392; GFX10-WGP-NEXT: s_clause 0x1 5393; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5394; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5395; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5396; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5397; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5398; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5399; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5400; GFX10-WGP-NEXT: v_mov_b32_e32 
v1, s1 5401; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5402; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5403; GFX10-WGP-NEXT: s_endpgm 5404; 5405; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 5406; GFX10-CU: ; %bb.0: ; %entry 5407; GFX10-CU-NEXT: s_clause 0x1 5408; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5409; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5410; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5411; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5412; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5413; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5414; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5415; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5416; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5417; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5418; GFX10-CU-NEXT: s_endpgm 5419; 5420; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 5421; SKIP-CACHE-INV: ; %bb.0: ; %entry 5422; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5423; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5424; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5425; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5426; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5427; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5428; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5429; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5430; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5431; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5432; SKIP-CACHE-INV-NEXT: s_endpgm 5433; 5434; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 5435; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5436; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5437; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5438; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5439; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5440; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5441; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], 
v[2:3] offset:16 5442; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5443; 5444; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 5445; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5446; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5447; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5448; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5449; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5450; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5451; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5452; GFX90A-TGSPLIT-NEXT: s_endpgm 5453 i32* %out, i32 %in, i32 %old) { 5454entry: 5455 %gep = getelementptr i32, i32* %out, i32 4 5456 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic 5457 ret void 5458} 5459 5460define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( 5461; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 5462; GFX7: ; %bb.0: ; %entry 5463; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5464; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5465; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5466; GFX7-NEXT: s_add_u32 s0, s0, 16 5467; GFX7-NEXT: s_addc_u32 s1, s1, 0 5468; GFX7-NEXT: v_mov_b32_e32 v0, s0 5469; GFX7-NEXT: v_mov_b32_e32 v2, s2 5470; GFX7-NEXT: v_mov_b32_e32 v1, s1 5471; GFX7-NEXT: v_mov_b32_e32 v3, s3 5472; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5473; GFX7-NEXT: s_endpgm 5474; 5475; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 5476; GFX10-WGP: ; %bb.0: ; %entry 5477; GFX10-WGP-NEXT: s_clause 0x1 5478; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5479; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5480; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5481; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5482; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5483; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5484; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5485; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 
s1 5486; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5487; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5488; GFX10-WGP-NEXT: s_endpgm 5489; 5490; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 5491; GFX10-CU: ; %bb.0: ; %entry 5492; GFX10-CU-NEXT: s_clause 0x1 5493; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5494; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5495; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5496; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5497; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5498; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5499; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5500; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5501; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5502; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5503; GFX10-CU-NEXT: s_endpgm 5504; 5505; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 5506; SKIP-CACHE-INV: ; %bb.0: ; %entry 5507; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5508; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5509; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5510; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5511; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5512; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5513; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5514; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5515; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5516; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5517; SKIP-CACHE-INV-NEXT: s_endpgm 5518; 5519; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 5520; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5521; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5522; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5523; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5524; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5525; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5526; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 5527; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5528; 5529; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 5530; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5531; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5532; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5533; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5534; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5535; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5536; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5537; GFX90A-TGSPLIT-NEXT: s_endpgm 5538 i32* %out, i32 %in, i32 %old) { 5539entry: 5540 %gep = getelementptr i32, i32* %out, i32 4 5541 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic 5542 ret void 5543} 5544 5545define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( 5546; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 5547; GFX7: ; %bb.0: ; %entry 5548; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5549; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5550; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5551; GFX7-NEXT: s_add_u32 s0, s0, 16 5552; GFX7-NEXT: s_addc_u32 s1, s1, 0 5553; GFX7-NEXT: v_mov_b32_e32 v0, s0 5554; GFX7-NEXT: v_mov_b32_e32 v2, s2 5555; GFX7-NEXT: v_mov_b32_e32 v1, s1 5556; GFX7-NEXT: v_mov_b32_e32 v3, s3 5557; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5558; GFX7-NEXT: s_endpgm 5559; 5560; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 5561; GFX10-WGP: ; %bb.0: ; %entry 5562; GFX10-WGP-NEXT: s_clause 0x1 5563; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5564; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5565; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5566; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5567; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5568; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5569; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5570; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5571; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5572; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5573; GFX10-WGP-NEXT: s_endpgm 5574; 5575; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 5576; GFX10-CU: ; %bb.0: ; %entry 5577; GFX10-CU-NEXT: s_clause 0x1 5578; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5579; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5580; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5581; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5582; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5583; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5584; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5585; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5586; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5587; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5588; GFX10-CU-NEXT: s_endpgm 5589; 5590; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 5591; SKIP-CACHE-INV: ; %bb.0: ; %entry 5592; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5593; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5594; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5595; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5596; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5597; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5598; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5599; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5600; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5601; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5602; SKIP-CACHE-INV-NEXT: s_endpgm 5603; 5604; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 5605; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5606; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5607; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5608; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5609; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5610; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5611; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
5612; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5613; 5614; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 5615; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5616; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5617; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5618; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5619; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5620; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5621; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5622; GFX90A-TGSPLIT-NEXT: s_endpgm 5623 i32* %out, i32 %in, i32 %old) { 5624entry: 5625 %gep = getelementptr i32, i32* %out, i32 4 5626 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic 5627 ret void 5628} 5629 5630define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( 5631; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 5632; GFX7: ; %bb.0: ; %entry 5633; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5634; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5635; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5636; GFX7-NEXT: s_add_u32 s0, s0, 16 5637; GFX7-NEXT: s_addc_u32 s1, s1, 0 5638; GFX7-NEXT: v_mov_b32_e32 v0, s0 5639; GFX7-NEXT: v_mov_b32_e32 v2, s2 5640; GFX7-NEXT: v_mov_b32_e32 v1, s1 5641; GFX7-NEXT: v_mov_b32_e32 v3, s3 5642; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5643; GFX7-NEXT: s_endpgm 5644; 5645; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 5646; GFX10-WGP: ; %bb.0: ; %entry 5647; GFX10-WGP-NEXT: s_clause 0x1 5648; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5649; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5650; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5651; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5652; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5653; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5654; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5655; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5656; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5657; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5658; GFX10-WGP-NEXT: s_endpgm 5659; 5660; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 5661; GFX10-CU: ; %bb.0: ; %entry 5662; GFX10-CU-NEXT: s_clause 0x1 5663; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5664; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5665; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5666; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5667; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5668; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5669; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5670; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5671; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5672; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5673; GFX10-CU-NEXT: s_endpgm 5674; 5675; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 5676; SKIP-CACHE-INV: ; %bb.0: ; %entry 5677; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5678; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5679; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5680; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5681; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5682; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5683; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5684; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5685; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5686; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5687; SKIP-CACHE-INV-NEXT: s_endpgm 5688; 5689; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 5690; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5691; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5692; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5693; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5694; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5695; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5696; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
5697; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5698; 5699; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 5700; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5701; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5702; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5703; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5704; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5705; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5706; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5707; GFX90A-TGSPLIT-NEXT: s_endpgm 5708 i32* %out, i32 %in, i32 %old) { 5709entry: 5710 %gep = getelementptr i32, i32* %out, i32 4 5711 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic 5712 ret void 5713} 5714 5715define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( 5716; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 5717; GFX7: ; %bb.0: ; %entry 5718; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5719; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5720; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5721; GFX7-NEXT: s_add_u32 s0, s0, 16 5722; GFX7-NEXT: s_addc_u32 s1, s1, 0 5723; GFX7-NEXT: v_mov_b32_e32 v0, s0 5724; GFX7-NEXT: v_mov_b32_e32 v2, s2 5725; GFX7-NEXT: v_mov_b32_e32 v1, s1 5726; GFX7-NEXT: v_mov_b32_e32 v3, s3 5727; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5728; GFX7-NEXT: s_endpgm 5729; 5730; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 5731; GFX10-WGP: ; %bb.0: ; %entry 5732; GFX10-WGP-NEXT: s_clause 0x1 5733; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5734; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5735; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5736; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5737; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5738; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5739; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5740; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5741; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5742; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5743; GFX10-WGP-NEXT: s_endpgm 5744; 5745; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 5746; GFX10-CU: ; %bb.0: ; %entry 5747; GFX10-CU-NEXT: s_clause 0x1 5748; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5749; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5750; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5751; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5752; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5753; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5754; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5755; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5756; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5757; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5758; GFX10-CU-NEXT: s_endpgm 5759; 5760; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 5761; SKIP-CACHE-INV: ; %bb.0: ; %entry 5762; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5763; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5764; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5765; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5766; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5767; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5768; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5769; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5770; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5771; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5772; SKIP-CACHE-INV-NEXT: s_endpgm 5773; 5774; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 5775; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5776; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5777; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5778; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5779; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5780; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5781; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
5782; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5783; 5784; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 5785; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5786; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5787; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5788; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5789; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5790; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5791; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5792; GFX90A-TGSPLIT-NEXT: s_endpgm 5793 i32* %out, i32 %in, i32 %old) { 5794entry: 5795 %gep = getelementptr i32, i32* %out, i32 4 5796 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic 5797 ret void 5798} 5799 5800define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( 5801; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 5802; GFX7: ; %bb.0: ; %entry 5803; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5804; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5805; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5806; GFX7-NEXT: s_add_u32 s0, s0, 16 5807; GFX7-NEXT: s_addc_u32 s1, s1, 0 5808; GFX7-NEXT: v_mov_b32_e32 v0, s0 5809; GFX7-NEXT: v_mov_b32_e32 v2, s2 5810; GFX7-NEXT: v_mov_b32_e32 v1, s1 5811; GFX7-NEXT: v_mov_b32_e32 v3, s3 5812; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5813; GFX7-NEXT: s_endpgm 5814; 5815; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 5816; GFX10-WGP: ; %bb.0: ; %entry 5817; GFX10-WGP-NEXT: s_clause 0x1 5818; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5819; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5820; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5821; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5822; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5823; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5824; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5825; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5826; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5827; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5828; GFX10-WGP-NEXT: s_endpgm 5829; 5830; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 5831; GFX10-CU: ; %bb.0: ; %entry 5832; GFX10-CU-NEXT: s_clause 0x1 5833; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5834; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5835; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5836; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5837; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5838; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5839; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5840; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5841; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5842; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5843; GFX10-CU-NEXT: s_endpgm 5844; 5845; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 5846; SKIP-CACHE-INV: ; %bb.0: ; %entry 5847; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5848; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5849; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5850; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5851; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5852; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5853; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5854; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5855; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5856; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5857; SKIP-CACHE-INV-NEXT: s_endpgm 5858; 5859; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 5860; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5861; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5862; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5863; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5864; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5865; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5866; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
5867; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5868; 5869; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 5870; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5871; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5872; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5873; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5874; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5875; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5876; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5877; GFX90A-TGSPLIT-NEXT: s_endpgm 5878 i32* %out, i32 %in, i32 %old) { 5879entry: 5880 %gep = getelementptr i32, i32* %out, i32 4 5881 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire 5882 ret void 5883} 5884 5885define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( 5886; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 5887; GFX7: ; %bb.0: ; %entry 5888; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5889; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5890; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5891; GFX7-NEXT: s_add_u32 s0, s0, 16 5892; GFX7-NEXT: s_addc_u32 s1, s1, 0 5893; GFX7-NEXT: v_mov_b32_e32 v0, s0 5894; GFX7-NEXT: v_mov_b32_e32 v2, s2 5895; GFX7-NEXT: v_mov_b32_e32 v1, s1 5896; GFX7-NEXT: v_mov_b32_e32 v3, s3 5897; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5898; GFX7-NEXT: s_endpgm 5899; 5900; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 5901; GFX10-WGP: ; %bb.0: ; %entry 5902; GFX10-WGP-NEXT: s_clause 0x1 5903; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5904; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5905; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5906; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5907; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5908; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5909; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5910; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5911; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 5912; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5913; GFX10-WGP-NEXT: s_endpgm 5914; 5915; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 5916; GFX10-CU: ; %bb.0: ; %entry 5917; GFX10-CU-NEXT: s_clause 0x1 5918; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5919; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5920; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5921; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5922; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5923; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5924; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5925; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5926; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5927; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5928; GFX10-CU-NEXT: s_endpgm 5929; 5930; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 5931; SKIP-CACHE-INV: ; %bb.0: ; %entry 5932; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5933; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5934; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5935; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5936; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5937; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5938; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5939; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5940; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5941; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5942; SKIP-CACHE-INV-NEXT: s_endpgm 5943; 5944; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 5945; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5946; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5947; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5948; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5949; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5950; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5951; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5952; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5953; 5954; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 5955; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5956; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5957; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5958; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5959; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5960; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5961; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5962; GFX90A-TGSPLIT-NEXT: s_endpgm 5963 i32* %out, i32 %in, i32 %old) { 5964entry: 5965 %gep = getelementptr i32, i32* %out, i32 4 5966 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire 5967 ret void 5968} 5969 5970define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( 5971; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 5972; GFX7: ; %bb.0: ; %entry 5973; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5974; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5975; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5976; GFX7-NEXT: s_add_u32 s0, s0, 16 5977; GFX7-NEXT: s_addc_u32 s1, s1, 0 5978; GFX7-NEXT: v_mov_b32_e32 v0, s0 5979; GFX7-NEXT: v_mov_b32_e32 v2, s2 5980; GFX7-NEXT: v_mov_b32_e32 v1, s1 5981; GFX7-NEXT: v_mov_b32_e32 v3, s3 5982; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5983; GFX7-NEXT: s_endpgm 5984; 5985; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 5986; GFX10-WGP: ; %bb.0: ; %entry 5987; GFX10-WGP-NEXT: s_clause 0x1 5988; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5989; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5990; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5991; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5992; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5993; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5994; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5995; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5996; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 5997; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5998; GFX10-WGP-NEXT: s_endpgm 5999; 6000; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 6001; GFX10-CU: ; %bb.0: ; %entry 6002; GFX10-CU-NEXT: s_clause 0x1 6003; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6004; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6005; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6006; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6007; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6008; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6009; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6010; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6011; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6012; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6013; GFX10-CU-NEXT: s_endpgm 6014; 6015; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 6016; SKIP-CACHE-INV: ; %bb.0: ; %entry 6017; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6018; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6019; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6020; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6021; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6022; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6023; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6024; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6025; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6026; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6027; SKIP-CACHE-INV-NEXT: s_endpgm 6028; 6029; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 6030; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6031; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6032; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6033; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6034; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6035; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6036; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6037; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6038; 6039; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 6040; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6041; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6042; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6043; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6044; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6045; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6046; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6047; GFX90A-TGSPLIT-NEXT: s_endpgm 6048 i32* %out, i32 %in, i32 %old) { 6049entry: 6050 %gep = getelementptr i32, i32* %out, i32 4 6051 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire 6052 ret void 6053} 6054 6055define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( 6056; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 6057; GFX7: ; %bb.0: ; %entry 6058; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6059; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6060; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6061; GFX7-NEXT: s_add_u32 s0, s0, 16 6062; GFX7-NEXT: s_addc_u32 s1, s1, 0 6063; GFX7-NEXT: v_mov_b32_e32 v0, s0 6064; GFX7-NEXT: v_mov_b32_e32 v2, s2 6065; GFX7-NEXT: v_mov_b32_e32 v1, s1 6066; GFX7-NEXT: v_mov_b32_e32 v3, s3 6067; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6068; GFX7-NEXT: s_endpgm 6069; 6070; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 6071; GFX10-WGP: ; %bb.0: ; %entry 6072; GFX10-WGP-NEXT: s_clause 0x1 6073; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6074; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6075; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6076; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6077; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6078; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6079; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6080; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6081; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 6082; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6083; GFX10-WGP-NEXT: s_endpgm 6084; 6085; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 6086; GFX10-CU: ; %bb.0: ; %entry 6087; GFX10-CU-NEXT: s_clause 0x1 6088; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6089; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6090; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6091; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6092; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6093; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6094; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6095; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6096; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6097; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6098; GFX10-CU-NEXT: s_endpgm 6099; 6100; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 6101; SKIP-CACHE-INV: ; %bb.0: ; %entry 6102; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6103; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6104; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6105; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6106; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6107; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6108; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6109; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6110; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6111; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6112; SKIP-CACHE-INV-NEXT: s_endpgm 6113; 6114; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 6115; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6116; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6117; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6118; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6119; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6120; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6121; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6122; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6123; 6124; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 6125; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6126; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6127; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6128; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6129; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6130; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6131; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6132; GFX90A-TGSPLIT-NEXT: s_endpgm 6133 i32* %out, i32 %in, i32 %old) { 6134entry: 6135 %gep = getelementptr i32, i32* %out, i32 4 6136 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire 6137 ret void 6138} 6139 6140define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( 6141; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 6142; GFX7: ; %bb.0: ; %entry 6143; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6144; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6145; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6146; GFX7-NEXT: s_add_u32 s0, s0, 16 6147; GFX7-NEXT: s_addc_u32 s1, s1, 0 6148; GFX7-NEXT: v_mov_b32_e32 v0, s0 6149; GFX7-NEXT: v_mov_b32_e32 v2, s2 6150; GFX7-NEXT: v_mov_b32_e32 v1, s1 6151; GFX7-NEXT: v_mov_b32_e32 v3, s3 6152; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6153; GFX7-NEXT: s_endpgm 6154; 6155; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 6156; GFX10-WGP: ; %bb.0: ; %entry 6157; GFX10-WGP-NEXT: s_clause 0x1 6158; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6159; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6160; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6161; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6162; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6163; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6164; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6165; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6166; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 6167; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6168; GFX10-WGP-NEXT: s_endpgm 6169; 6170; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 6171; GFX10-CU: ; %bb.0: ; %entry 6172; GFX10-CU-NEXT: s_clause 0x1 6173; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6174; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6175; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6176; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6177; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6178; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6179; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6180; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6181; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6182; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6183; GFX10-CU-NEXT: s_endpgm 6184; 6185; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 6186; SKIP-CACHE-INV: ; %bb.0: ; %entry 6187; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6188; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6189; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6190; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6191; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6192; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6193; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6194; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6195; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6196; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6197; SKIP-CACHE-INV-NEXT: s_endpgm 6198; 6199; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 6200; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6201; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6202; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6203; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6204; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6205; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6206; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6207; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6208; 6209; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 6210; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6211; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6212; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6213; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6214; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6215; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6216; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6217; GFX90A-TGSPLIT-NEXT: s_endpgm 6218 i32* %out, i32 %in, i32 %old) { 6219entry: 6220 %gep = getelementptr i32, i32* %out, i32 4 6221 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire 6222 ret void 6223} 6224 6225define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( 6226; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 6227; GFX7: ; %bb.0: ; %entry 6228; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6229; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6230; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6231; GFX7-NEXT: s_add_u32 s0, s0, 16 6232; GFX7-NEXT: s_addc_u32 s1, s1, 0 6233; GFX7-NEXT: v_mov_b32_e32 v0, s0 6234; GFX7-NEXT: v_mov_b32_e32 v2, s2 6235; GFX7-NEXT: v_mov_b32_e32 v1, s1 6236; GFX7-NEXT: v_mov_b32_e32 v3, s3 6237; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6238; GFX7-NEXT: s_endpgm 6239; 6240; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 6241; GFX10-WGP: ; %bb.0: ; %entry 6242; GFX10-WGP-NEXT: s_clause 0x1 6243; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6244; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6245; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6246; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6247; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6248; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6249; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6250; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6251; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 6252; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6253; GFX10-WGP-NEXT: s_endpgm 6254; 6255; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 6256; GFX10-CU: ; %bb.0: ; %entry 6257; GFX10-CU-NEXT: s_clause 0x1 6258; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6259; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6260; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6261; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6262; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6263; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6264; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6265; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6266; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6267; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6268; GFX10-CU-NEXT: s_endpgm 6269; 6270; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 6271; SKIP-CACHE-INV: ; %bb.0: ; %entry 6272; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6273; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6274; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6275; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6276; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6277; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6278; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6279; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6280; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6281; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6282; SKIP-CACHE-INV-NEXT: s_endpgm 6283; 6284; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 6285; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6286; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6287; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6288; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6289; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6290; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6291; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6292; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6293; 6294; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 6295; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6296; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6297; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6298; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6299; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6300; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6301; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6302; GFX90A-TGSPLIT-NEXT: s_endpgm 6303 i32* %out, i32 %in, i32 %old) { 6304entry: 6305 %gep = getelementptr i32, i32* %out, i32 4 6306 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst 6307 ret void 6308} 6309 6310define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( 6311; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 6312; GFX7: ; %bb.0: ; %entry 6313; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6314; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6315; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6316; GFX7-NEXT: s_add_u32 s0, s0, 16 6317; GFX7-NEXT: s_addc_u32 s1, s1, 0 6318; GFX7-NEXT: v_mov_b32_e32 v0, s0 6319; GFX7-NEXT: v_mov_b32_e32 v2, s2 6320; GFX7-NEXT: v_mov_b32_e32 v1, s1 6321; GFX7-NEXT: v_mov_b32_e32 v3, s3 6322; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6323; GFX7-NEXT: s_endpgm 6324; 6325; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 6326; GFX10-WGP: ; %bb.0: ; %entry 6327; GFX10-WGP-NEXT: s_clause 0x1 6328; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6329; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6330; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6331; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6332; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6333; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6334; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6335; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6336; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 6337; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6338; GFX10-WGP-NEXT: s_endpgm 6339; 6340; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 6341; GFX10-CU: ; %bb.0: ; %entry 6342; GFX10-CU-NEXT: s_clause 0x1 6343; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6344; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6345; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6346; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6347; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6348; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6349; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6350; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6351; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6352; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6353; GFX10-CU-NEXT: s_endpgm 6354; 6355; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 6356; SKIP-CACHE-INV: ; %bb.0: ; %entry 6357; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6358; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6359; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6360; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6361; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6362; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6363; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6364; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6365; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6366; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6367; SKIP-CACHE-INV-NEXT: s_endpgm 6368; 6369; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 6370; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6371; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6372; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6373; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6374; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6375; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6376; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6377; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6378; 6379; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 6380; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6381; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6382; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6383; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6384; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6385; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6386; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6387; GFX90A-TGSPLIT-NEXT: s_endpgm 6388 i32* %out, i32 %in, i32 %old) { 6389entry: 6390 %gep = getelementptr i32, i32* %out, i32 4 6391 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst 6392 ret void 6393} 6394 6395define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( 6396; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 6397; GFX7: ; %bb.0: ; %entry 6398; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6399; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6400; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6401; GFX7-NEXT: s_add_u32 s0, s0, 16 6402; GFX7-NEXT: s_addc_u32 s1, s1, 0 6403; GFX7-NEXT: v_mov_b32_e32 v0, s0 6404; GFX7-NEXT: v_mov_b32_e32 v2, s2 6405; GFX7-NEXT: v_mov_b32_e32 v1, s1 6406; GFX7-NEXT: v_mov_b32_e32 v3, s3 6407; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6408; GFX7-NEXT: s_endpgm 6409; 6410; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 6411; GFX10-WGP: ; %bb.0: ; %entry 6412; GFX10-WGP-NEXT: s_clause 0x1 6413; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6414; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6415; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6416; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6417; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6418; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6419; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6420; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6421; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 6422; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6423; GFX10-WGP-NEXT: s_endpgm 6424; 6425; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 6426; GFX10-CU: ; %bb.0: ; %entry 6427; GFX10-CU-NEXT: s_clause 0x1 6428; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6429; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6430; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6431; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6432; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6433; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6434; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6435; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6436; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6437; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6438; GFX10-CU-NEXT: s_endpgm 6439; 6440; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 6441; SKIP-CACHE-INV: ; %bb.0: ; %entry 6442; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6443; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6444; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6445; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6446; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6447; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6448; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6449; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6450; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6451; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6452; SKIP-CACHE-INV-NEXT: s_endpgm 6453; 6454; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 6455; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6456; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6457; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6458; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6459; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6460; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6461; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6462; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6463; 6464; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 6465; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6466; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6467; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6468; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6469; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6470; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6471; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6472; GFX90A-TGSPLIT-NEXT: s_endpgm 6473 i32* %out, i32 %in, i32 %old) { 6474entry: 6475 %gep = getelementptr i32, i32* %out, i32 4 6476 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst 6477 ret void 6478} 6479 6480define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( 6481; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 6482; GFX7: ; %bb.0: ; %entry 6483; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6484; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6485; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6486; GFX7-NEXT: s_add_u32 s0, s0, 16 6487; GFX7-NEXT: s_addc_u32 s1, s1, 0 6488; GFX7-NEXT: v_mov_b32_e32 v0, s0 6489; GFX7-NEXT: v_mov_b32_e32 v2, s2 6490; GFX7-NEXT: v_mov_b32_e32 v1, s1 6491; GFX7-NEXT: v_mov_b32_e32 v3, s3 6492; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6493; GFX7-NEXT: s_endpgm 6494; 6495; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 6496; GFX10-WGP: ; %bb.0: ; %entry 6497; GFX10-WGP-NEXT: s_clause 0x1 6498; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6499; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6500; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6501; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6502; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6503; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6504; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6505; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6506; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 6507; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6508; GFX10-WGP-NEXT: s_endpgm 6509; 6510; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 6511; GFX10-CU: ; %bb.0: ; %entry 6512; GFX10-CU-NEXT: s_clause 0x1 6513; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6514; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6515; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6516; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6517; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6518; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6519; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6520; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6521; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6522; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6523; GFX10-CU-NEXT: s_endpgm 6524; 6525; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 6526; SKIP-CACHE-INV: ; %bb.0: ; %entry 6527; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6528; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6529; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6530; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6531; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6532; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6533; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6534; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6535; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6536; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6537; SKIP-CACHE-INV-NEXT: s_endpgm 6538; 6539; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 6540; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6541; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6542; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6543; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6544; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6545; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6546; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6547; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6548; 6549; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 6550; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6551; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6552; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6553; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6554; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6555; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6556; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6557; GFX90A-TGSPLIT-NEXT: s_endpgm 6558 i32* %out, i32 %in, i32 %old) { 6559entry: 6560 %gep = getelementptr i32, i32* %out, i32 4 6561 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst 6562 ret void 6563} 6564 6565define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( 6566; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 6567; GFX7: ; %bb.0: ; %entry 6568; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6569; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6570; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6571; GFX7-NEXT: s_add_u32 s0, s0, 16 6572; GFX7-NEXT: s_addc_u32 s1, s1, 0 6573; GFX7-NEXT: v_mov_b32_e32 v0, s0 6574; GFX7-NEXT: v_mov_b32_e32 v2, s2 6575; GFX7-NEXT: v_mov_b32_e32 v1, s1 6576; GFX7-NEXT: v_mov_b32_e32 v3, s3 6577; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6578; GFX7-NEXT: s_endpgm 6579; 6580; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 6581; GFX10-WGP: ; %bb.0: ; %entry 6582; GFX10-WGP-NEXT: s_clause 0x1 6583; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6584; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6585; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6586; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 6587; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 6588; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6589; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6590; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6591; GFX10-WGP-NEXT: 
v_mov_b32_e32 v3, s3 6592; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6593; GFX10-WGP-NEXT: s_endpgm 6594; 6595; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 6596; GFX10-CU: ; %bb.0: ; %entry 6597; GFX10-CU-NEXT: s_clause 0x1 6598; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6599; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6600; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6601; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 6602; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 6603; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6604; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6605; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6606; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6607; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6608; GFX10-CU-NEXT: s_endpgm 6609; 6610; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 6611; SKIP-CACHE-INV: ; %bb.0: ; %entry 6612; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6613; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6614; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6615; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 6616; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 6617; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6618; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6619; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6620; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6621; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 6622; SKIP-CACHE-INV-NEXT: s_endpgm 6623; 6624; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 6625; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6626; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6627; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6628; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6629; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6630; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6631; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6632; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6633; 6634; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 6635; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6636; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6637; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6638; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6639; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6640; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6641; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 6642; GFX90A-TGSPLIT-NEXT: s_endpgm 6643 i32* %out, i32 %in, i32 %old) { 6644entry: 6645 %gep = getelementptr i32, i32* %out, i32 4 6646 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst 6647 ret void 6648} 6649 6650define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( 6651; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 6652; GFX7: ; %bb.0: ; %entry 6653; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6654; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6655; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6656; GFX7-NEXT: s_add_u32 s4, s0, 16 6657; GFX7-NEXT: s_addc_u32 s5, s1, 0 6658; GFX7-NEXT: v_mov_b32_e32 v0, s4 6659; GFX7-NEXT: v_mov_b32_e32 v2, s2 6660; GFX7-NEXT: v_mov_b32_e32 v1, s5 6661; GFX7-NEXT: v_mov_b32_e32 v3, s3 6662; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6663; GFX7-NEXT: v_mov_b32_e32 v0, s0 6664; GFX7-NEXT: v_mov_b32_e32 v1, s1 6665; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6666; GFX7-NEXT: flat_store_dword v[0:1], v2 6667; GFX7-NEXT: s_endpgm 6668; 6669; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 6670; GFX10-WGP: ; %bb.0: ; %entry 6671; GFX10-WGP-NEXT: s_clause 0x1 6672; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6673; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6674; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6675; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 6676; 
GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 6677; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6678; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6679; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6680; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6681; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6682; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6683; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6684; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6685; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6686; GFX10-WGP-NEXT: s_endpgm 6687; 6688; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 6689; GFX10-CU: ; %bb.0: ; %entry 6690; GFX10-CU-NEXT: s_clause 0x1 6691; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6692; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6693; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6694; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 6695; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 6696; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6697; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6698; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6699; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6700; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6701; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6702; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6703; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6704; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6705; GFX10-CU-NEXT: s_endpgm 6706; 6707; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 6708; SKIP-CACHE-INV: ; %bb.0: ; %entry 6709; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6710; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6711; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6712; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 6713; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 6714; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6715; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6716; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 6717; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6718; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6719; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6720; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6721; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6722; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6723; SKIP-CACHE-INV-NEXT: s_endpgm 6724; 6725; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 6726; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6727; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6728; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6729; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6730; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6731; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6732; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6733; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6734; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6735; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6736; 6737; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 6738; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6739; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6740; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6741; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6742; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6743; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6744; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6745; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6746; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6747; GFX90A-TGSPLIT-NEXT: s_endpgm 6748 i32* %out, i32 %in, i32 %old) { 6749entry: 6750 %gep = getelementptr i32, i32* %out, i32 4 6751 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic 6752 %val0 = extractvalue { i32, i1 } %val, 0 6753 store i32 %val0, i32* %out, align 4 6754 ret void 6755} 6756 6757define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( 6758; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 6759; GFX7: ; %bb.0: ; %entry 6760; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6761; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6762; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6763; GFX7-NEXT: s_add_u32 s4, s0, 16 6764; GFX7-NEXT: s_addc_u32 s5, s1, 0 6765; GFX7-NEXT: v_mov_b32_e32 v0, s4 6766; GFX7-NEXT: v_mov_b32_e32 v2, s2 6767; GFX7-NEXT: v_mov_b32_e32 v1, s5 6768; GFX7-NEXT: v_mov_b32_e32 v3, s3 6769; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6770; GFX7-NEXT: v_mov_b32_e32 v0, s0 6771; GFX7-NEXT: v_mov_b32_e32 v1, s1 6772; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6773; GFX7-NEXT: flat_store_dword v[0:1], v2 6774; GFX7-NEXT: s_endpgm 6775; 6776; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 6777; GFX10-WGP: ; %bb.0: ; %entry 6778; GFX10-WGP-NEXT: s_clause 0x1 6779; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6780; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6781; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6782; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 6783; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 6784; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6785; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6786; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6787; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6788; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6789; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6790; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6791; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6792; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6793; GFX10-WGP-NEXT: s_endpgm 6794; 6795; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 6796; GFX10-CU: ; %bb.0: ; %entry 6797; GFX10-CU-NEXT: s_clause 0x1 6798; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6799; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6800; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6801; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 6802; 
GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 6803; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6804; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6805; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6806; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6807; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6808; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6809; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6810; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6811; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6812; GFX10-CU-NEXT: s_endpgm 6813; 6814; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 6815; SKIP-CACHE-INV: ; %bb.0: ; %entry 6816; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6817; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6818; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6819; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 6820; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 6821; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6822; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6823; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 6824; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6825; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6826; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6827; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6828; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6829; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6830; SKIP-CACHE-INV-NEXT: s_endpgm 6831; 6832; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 6833; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6834; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6835; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6836; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6837; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6838; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6839; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6840; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 6841; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6842; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6843; 6844; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 6845; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6846; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6847; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6848; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6849; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6850; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6851; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6852; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6853; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6854; GFX90A-TGSPLIT-NEXT: s_endpgm 6855 i32* %out, i32 %in, i32 %old) { 6856entry: 6857 %gep = getelementptr i32, i32* %out, i32 4 6858 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic 6859 %val0 = extractvalue { i32, i1 } %val, 0 6860 store i32 %val0, i32* %out, align 4 6861 ret void 6862} 6863 6864define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( 6865; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 6866; GFX7: ; %bb.0: ; %entry 6867; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6868; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6869; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6870; GFX7-NEXT: s_add_u32 s4, s0, 16 6871; GFX7-NEXT: s_addc_u32 s5, s1, 0 6872; GFX7-NEXT: v_mov_b32_e32 v0, s4 6873; GFX7-NEXT: v_mov_b32_e32 v2, s2 6874; GFX7-NEXT: v_mov_b32_e32 v1, s5 6875; GFX7-NEXT: v_mov_b32_e32 v3, s3 6876; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6877; GFX7-NEXT: v_mov_b32_e32 v0, s0 6878; GFX7-NEXT: v_mov_b32_e32 v1, s1 6879; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6880; GFX7-NEXT: flat_store_dword v[0:1], v2 6881; GFX7-NEXT: s_endpgm 6882; 6883; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 
6884; GFX10-WGP: ; %bb.0: ; %entry 6885; GFX10-WGP-NEXT: s_clause 0x1 6886; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6887; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6888; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6889; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 6890; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 6891; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6892; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 6893; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6894; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 6895; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6896; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 6897; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 6898; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6899; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6900; GFX10-WGP-NEXT: s_endpgm 6901; 6902; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 6903; GFX10-CU: ; %bb.0: ; %entry 6904; GFX10-CU-NEXT: s_clause 0x1 6905; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6906; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6907; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6908; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 6909; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 6910; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6911; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 6912; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6913; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 6914; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6915; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 6916; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 6917; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6918; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6919; GFX10-CU-NEXT: s_endpgm 6920; 6921; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 6922; SKIP-CACHE-INV: ; %bb.0: ; %entry 6923; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6924; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6925; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6926; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 6927; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, 
s3, 0 6928; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6929; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6930; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 6931; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6932; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6933; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6934; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6935; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6936; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6937; SKIP-CACHE-INV-NEXT: s_endpgm 6938; 6939; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 6940; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6941; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6942; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6943; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6944; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6945; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6946; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6947; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6948; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6949; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6950; 6951; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 6952; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6953; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6954; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6955; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6956; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6957; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6958; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6959; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6960; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6961; GFX90A-TGSPLIT-NEXT: s_endpgm 6962 i32* %out, i32 %in, i32 %old) { 6963entry: 6964 %gep = getelementptr i32, i32* %out, i32 4 6965 
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic 6966 %val0 = extractvalue { i32, i1 } %val, 0 6967 store i32 %val0, i32* %out, align 4 6968 ret void 6969} 6970 6971define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( 6972; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 6973; GFX7: ; %bb.0: ; %entry 6974; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6975; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 6976; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6977; GFX7-NEXT: s_add_u32 s4, s0, 16 6978; GFX7-NEXT: s_addc_u32 s5, s1, 0 6979; GFX7-NEXT: v_mov_b32_e32 v0, s4 6980; GFX7-NEXT: v_mov_b32_e32 v2, s2 6981; GFX7-NEXT: v_mov_b32_e32 v1, s5 6982; GFX7-NEXT: v_mov_b32_e32 v3, s3 6983; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6984; GFX7-NEXT: v_mov_b32_e32 v0, s0 6985; GFX7-NEXT: v_mov_b32_e32 v1, s1 6986; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6987; GFX7-NEXT: flat_store_dword v[0:1], v2 6988; GFX7-NEXT: s_endpgm 6989; 6990; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 6991; GFX10-WGP: ; %bb.0: ; %entry 6992; GFX10-WGP-NEXT: s_clause 0x1 6993; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6994; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6995; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6996; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 6997; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 6998; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6999; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7000; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7001; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7002; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7003; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7004; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7005; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7006; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7007; GFX10-WGP-NEXT: s_endpgm 7008; 7009; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 7010; GFX10-CU: ; %bb.0: ; 
%entry 7011; GFX10-CU-NEXT: s_clause 0x1 7012; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7013; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7014; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7015; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7016; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7017; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7018; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7019; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7020; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7021; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7022; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7023; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7024; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7025; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7026; GFX10-CU-NEXT: s_endpgm 7027; 7028; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 7029; SKIP-CACHE-INV: ; %bb.0: ; %entry 7030; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7031; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7032; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7033; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7034; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7035; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7036; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7037; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7038; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7039; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7040; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7041; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7042; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7043; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7044; SKIP-CACHE-INV-NEXT: s_endpgm 7045; 7046; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 7047; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7048; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7049; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7050; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7051; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] 7052; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7053; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7054; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7055; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7056; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7057; 7058; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 7059; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7060; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7061; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7062; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7063; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7064; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7065; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7066; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7067; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7068; GFX90A-TGSPLIT-NEXT: s_endpgm 7069 i32* %out, i32 %in, i32 %old) { 7070entry: 7071 %gep = getelementptr i32, i32* %out, i32 4 7072 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic 7073 %val0 = extractvalue { i32, i1 } %val, 0 7074 store i32 %val0, i32* %out, align 4 7075 ret void 7076} 7077 7078define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( 7079; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 7080; GFX7: ; %bb.0: ; %entry 7081; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7082; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7083; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7084; GFX7-NEXT: s_add_u32 s4, s0, 16 7085; GFX7-NEXT: s_addc_u32 s5, s1, 0 7086; GFX7-NEXT: v_mov_b32_e32 v0, s4 7087; GFX7-NEXT: v_mov_b32_e32 v2, s2 7088; GFX7-NEXT: v_mov_b32_e32 v1, s5 7089; GFX7-NEXT: v_mov_b32_e32 v3, s3 7090; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7091; GFX7-NEXT: v_mov_b32_e32 v0, s0 7092; 
GFX7-NEXT: v_mov_b32_e32 v1, s1 7093; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7094; GFX7-NEXT: flat_store_dword v[0:1], v2 7095; GFX7-NEXT: s_endpgm 7096; 7097; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 7098; GFX10-WGP: ; %bb.0: ; %entry 7099; GFX10-WGP-NEXT: s_clause 0x1 7100; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7101; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7102; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7103; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7104; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7105; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7106; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7107; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7108; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7109; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7110; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7111; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7112; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7113; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7114; GFX10-WGP-NEXT: s_endpgm 7115; 7116; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 7117; GFX10-CU: ; %bb.0: ; %entry 7118; GFX10-CU-NEXT: s_clause 0x1 7119; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7120; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7121; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7122; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7123; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7124; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7125; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7126; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7127; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7128; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7129; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7130; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7131; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7132; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7133; GFX10-CU-NEXT: s_endpgm 7134; 7135; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 7136; SKIP-CACHE-INV: ; %bb.0: ; %entry 7137; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x9 7138; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7139; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7140; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7141; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7142; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7143; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7144; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7145; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7146; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7147; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7148; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7149; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7150; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7151; SKIP-CACHE-INV-NEXT: s_endpgm 7152; 7153; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 7154; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7155; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7156; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7157; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7158; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7159; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7160; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7161; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7162; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7163; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7164; 7165; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 7166; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7167; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7168; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7169; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7170; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7171; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7172; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 
7173; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7174; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7175; GFX90A-TGSPLIT-NEXT: s_endpgm 7176 i32* %out, i32 %in, i32 %old) { 7177entry: 7178 %gep = getelementptr i32, i32* %out, i32 4 7179 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic 7180 %val0 = extractvalue { i32, i1 } %val, 0 7181 store i32 %val0, i32* %out, align 4 7182 ret void 7183} 7184 7185define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( 7186; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 7187; GFX7: ; %bb.0: ; %entry 7188; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7189; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7190; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7191; GFX7-NEXT: s_add_u32 s4, s0, 16 7192; GFX7-NEXT: s_addc_u32 s5, s1, 0 7193; GFX7-NEXT: v_mov_b32_e32 v0, s4 7194; GFX7-NEXT: v_mov_b32_e32 v2, s2 7195; GFX7-NEXT: v_mov_b32_e32 v1, s5 7196; GFX7-NEXT: v_mov_b32_e32 v3, s3 7197; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7198; GFX7-NEXT: v_mov_b32_e32 v0, s0 7199; GFX7-NEXT: v_mov_b32_e32 v1, s1 7200; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7201; GFX7-NEXT: flat_store_dword v[0:1], v2 7202; GFX7-NEXT: s_endpgm 7203; 7204; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 7205; GFX10-WGP: ; %bb.0: ; %entry 7206; GFX10-WGP-NEXT: s_clause 0x1 7207; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7208; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7209; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7210; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7211; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7212; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7213; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7214; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7215; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7216; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7217; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7218; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7219; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7220; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7221; GFX10-WGP-NEXT: s_endpgm 7222; 7223; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 7224; GFX10-CU: ; %bb.0: ; %entry 7225; GFX10-CU-NEXT: s_clause 0x1 7226; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7227; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7228; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7229; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7230; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7231; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7232; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7233; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7234; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7235; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7236; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7237; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7238; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7239; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7240; GFX10-CU-NEXT: s_endpgm 7241; 7242; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 7243; SKIP-CACHE-INV: ; %bb.0: ; %entry 7244; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7245; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7246; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7247; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7248; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7249; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7250; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7251; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7252; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7253; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7254; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7255; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7256; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7257; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7258; SKIP-CACHE-INV-NEXT: s_endpgm 7259; 7260; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 7261; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry 7262; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7263; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7264; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7265; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7266; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7267; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7268; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7269; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7270; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7271; 7272; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 7273; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7274; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7275; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7276; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7277; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7278; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7279; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7280; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7281; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7282; GFX90A-TGSPLIT-NEXT: s_endpgm 7283 i32* %out, i32 %in, i32 %old) { 7284entry: 7285 %gep = getelementptr i32, i32* %out, i32 4 7286 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire 7287 %val0 = extractvalue { i32, i1 } %val, 0 7288 store i32 %val0, i32* %out, align 4 7289 ret void 7290} 7291 7292define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( 7293; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 7294; GFX7: ; %bb.0: ; %entry 7295; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7296; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7297; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7298; GFX7-NEXT: s_add_u32 s4, s0, 16 7299; GFX7-NEXT: s_addc_u32 s5, s1, 0 7300; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 7301; GFX7-NEXT: v_mov_b32_e32 v2, s2 7302; GFX7-NEXT: v_mov_b32_e32 v1, s5 7303; GFX7-NEXT: v_mov_b32_e32 v3, s3 7304; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7305; GFX7-NEXT: v_mov_b32_e32 v0, s0 7306; GFX7-NEXT: v_mov_b32_e32 v1, s1 7307; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7308; GFX7-NEXT: flat_store_dword v[0:1], v2 7309; GFX7-NEXT: s_endpgm 7310; 7311; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 7312; GFX10-WGP: ; %bb.0: ; %entry 7313; GFX10-WGP-NEXT: s_clause 0x1 7314; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7315; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7316; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7317; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7318; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7319; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7320; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7321; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7322; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7323; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7324; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7325; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7326; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7327; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7328; GFX10-WGP-NEXT: s_endpgm 7329; 7330; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 7331; GFX10-CU: ; %bb.0: ; %entry 7332; GFX10-CU-NEXT: s_clause 0x1 7333; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7334; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7335; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7336; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7337; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7338; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7339; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7340; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7341; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7342; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7343; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7344; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7345; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
7346; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7347; GFX10-CU-NEXT: s_endpgm 7348; 7349; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 7350; SKIP-CACHE-INV: ; %bb.0: ; %entry 7351; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7352; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7353; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7354; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7355; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7356; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7357; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7358; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7359; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7360; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7361; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7362; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7363; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7364; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7365; SKIP-CACHE-INV-NEXT: s_endpgm 7366; 7367; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 7368; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7369; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7370; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7371; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7372; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7373; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7374; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7375; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7376; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7377; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7378; 7379; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 7380; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7381; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7382; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7383; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7384; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7385; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7386; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7387; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7388; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7389; GFX90A-TGSPLIT-NEXT: s_endpgm 7390 i32* %out, i32 %in, i32 %old) { 7391entry: 7392 %gep = getelementptr i32, i32* %out, i32 4 7393 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire 7394 %val0 = extractvalue { i32, i1 } %val, 0 7395 store i32 %val0, i32* %out, align 4 7396 ret void 7397} 7398 7399define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( 7400; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 7401; GFX7: ; %bb.0: ; %entry 7402; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7403; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7404; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7405; GFX7-NEXT: s_add_u32 s4, s0, 16 7406; GFX7-NEXT: s_addc_u32 s5, s1, 0 7407; GFX7-NEXT: v_mov_b32_e32 v0, s4 7408; GFX7-NEXT: v_mov_b32_e32 v2, s2 7409; GFX7-NEXT: v_mov_b32_e32 v1, s5 7410; GFX7-NEXT: v_mov_b32_e32 v3, s3 7411; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7412; GFX7-NEXT: v_mov_b32_e32 v0, s0 7413; GFX7-NEXT: v_mov_b32_e32 v1, s1 7414; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7415; GFX7-NEXT: flat_store_dword v[0:1], v2 7416; GFX7-NEXT: s_endpgm 7417; 7418; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 7419; GFX10-WGP: ; %bb.0: ; %entry 7420; GFX10-WGP-NEXT: s_clause 0x1 7421; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7422; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7423; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7424; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7425; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7426; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7427; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7428; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s5 7429; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7430; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7431; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7432; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7433; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7434; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7435; GFX10-WGP-NEXT: s_endpgm 7436; 7437; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 7438; GFX10-CU: ; %bb.0: ; %entry 7439; GFX10-CU-NEXT: s_clause 0x1 7440; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7441; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7442; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7443; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7444; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7445; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7446; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7447; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7448; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7449; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7450; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7451; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7452; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7453; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7454; GFX10-CU-NEXT: s_endpgm 7455; 7456; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 7457; SKIP-CACHE-INV: ; %bb.0: ; %entry 7458; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7459; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7460; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7461; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7462; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7463; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7464; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7465; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7466; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7467; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7468; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7469; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7470; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7471; 
SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7472; SKIP-CACHE-INV-NEXT: s_endpgm 7473; 7474; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 7475; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7476; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7477; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7478; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7479; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7480; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7481; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7482; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7483; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7484; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7485; 7486; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 7487; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7488; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7489; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7490; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7491; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7492; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7493; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7494; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7495; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7496; GFX90A-TGSPLIT-NEXT: s_endpgm 7497 i32* %out, i32 %in, i32 %old) { 7498entry: 7499 %gep = getelementptr i32, i32* %out, i32 4 7500 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire 7501 %val0 = extractvalue { i32, i1 } %val, 0 7502 store i32 %val0, i32* %out, align 4 7503 ret void 7504} 7505 7506define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( 7507; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 7508; GFX7: ; %bb.0: ; %entry 7509; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 7510; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7511; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7512; GFX7-NEXT: s_add_u32 s4, s0, 16 7513; GFX7-NEXT: s_addc_u32 s5, s1, 0 7514; GFX7-NEXT: v_mov_b32_e32 v0, s4 7515; GFX7-NEXT: v_mov_b32_e32 v2, s2 7516; GFX7-NEXT: v_mov_b32_e32 v1, s5 7517; GFX7-NEXT: v_mov_b32_e32 v3, s3 7518; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7519; GFX7-NEXT: v_mov_b32_e32 v0, s0 7520; GFX7-NEXT: v_mov_b32_e32 v1, s1 7521; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7522; GFX7-NEXT: flat_store_dword v[0:1], v2 7523; GFX7-NEXT: s_endpgm 7524; 7525; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 7526; GFX10-WGP: ; %bb.0: ; %entry 7527; GFX10-WGP-NEXT: s_clause 0x1 7528; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7529; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7530; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7531; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7532; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7533; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7534; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7535; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7536; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7537; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7538; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7539; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7540; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7541; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7542; GFX10-WGP-NEXT: s_endpgm 7543; 7544; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 7545; GFX10-CU: ; %bb.0: ; %entry 7546; GFX10-CU-NEXT: s_clause 0x1 7547; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7548; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7549; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7550; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7551; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7552; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7553; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7554; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7555; GFX10-CU-NEXT: 
v_mov_b32_e32 v3, s3 7556; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7557; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7558; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7559; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7560; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7561; GFX10-CU-NEXT: s_endpgm 7562; 7563; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 7564; SKIP-CACHE-INV: ; %bb.0: ; %entry 7565; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7566; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7567; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7568; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7569; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7570; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7571; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7572; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7573; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7574; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7575; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7576; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7577; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7578; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7579; SKIP-CACHE-INV-NEXT: s_endpgm 7580; 7581; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 7582; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7583; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7584; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7585; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7586; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7587; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7588; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7589; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7590; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7591; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7592; 7593; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: 7594; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7595; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7596; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7597; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7598; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7599; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7600; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7601; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7602; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7603; GFX90A-TGSPLIT-NEXT: s_endpgm 7604 i32* %out, i32 %in, i32 %old) { 7605entry: 7606 %gep = getelementptr i32, i32* %out, i32 4 7607 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire 7608 %val0 = extractvalue { i32, i1 } %val, 0 7609 store i32 %val0, i32* %out, align 4 7610 ret void 7611} 7612 7613define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( 7614; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 7615; GFX7: ; %bb.0: ; %entry 7616; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7617; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7618; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7619; GFX7-NEXT: s_add_u32 s4, s0, 16 7620; GFX7-NEXT: s_addc_u32 s5, s1, 0 7621; GFX7-NEXT: v_mov_b32_e32 v0, s4 7622; GFX7-NEXT: v_mov_b32_e32 v2, s2 7623; GFX7-NEXT: v_mov_b32_e32 v1, s5 7624; GFX7-NEXT: v_mov_b32_e32 v3, s3 7625; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7626; GFX7-NEXT: v_mov_b32_e32 v0, s0 7627; GFX7-NEXT: v_mov_b32_e32 v1, s1 7628; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7629; GFX7-NEXT: flat_store_dword v[0:1], v2 7630; GFX7-NEXT: s_endpgm 7631; 7632; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 7633; GFX10-WGP: ; %bb.0: ; %entry 7634; GFX10-WGP-NEXT: s_clause 0x1 7635; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7636; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 7637; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7638; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7639; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7640; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7641; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7642; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7643; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7644; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7645; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7646; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7647; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7648; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7649; GFX10-WGP-NEXT: s_endpgm 7650; 7651; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 7652; GFX10-CU: ; %bb.0: ; %entry 7653; GFX10-CU-NEXT: s_clause 0x1 7654; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7655; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7656; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7657; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7658; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7659; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7660; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7661; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7662; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7663; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7664; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7665; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7666; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7667; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7668; GFX10-CU-NEXT: s_endpgm 7669; 7670; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 7671; SKIP-CACHE-INV: ; %bb.0: ; %entry 7672; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7673; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7674; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7675; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7676; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7677; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7678; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7679; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7680; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7681; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7682; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7683; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7684; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7685; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7686; SKIP-CACHE-INV-NEXT: s_endpgm 7687; 7688; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 7689; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7690; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7691; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7692; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7693; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7694; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7695; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7696; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7697; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7698; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7699; 7700; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 7701; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7702; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7703; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7704; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7705; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7706; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7707; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7708; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7709; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7710; GFX90A-TGSPLIT-NEXT: s_endpgm 7711 i32* %out, i32 %in, i32 %old) { 7712entry: 7713 %gep = getelementptr i32, i32* %out, i32 4 7714 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire 7715 %val0 = extractvalue { i32, i1 } %val, 0 7716 store 
i32 %val0, i32* %out, align 4 7717 ret void 7718} 7719 7720define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( 7721; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 7722; GFX7: ; %bb.0: ; %entry 7723; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7724; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7725; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7726; GFX7-NEXT: s_add_u32 s4, s0, 16 7727; GFX7-NEXT: s_addc_u32 s5, s1, 0 7728; GFX7-NEXT: v_mov_b32_e32 v0, s4 7729; GFX7-NEXT: v_mov_b32_e32 v2, s2 7730; GFX7-NEXT: v_mov_b32_e32 v1, s5 7731; GFX7-NEXT: v_mov_b32_e32 v3, s3 7732; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7733; GFX7-NEXT: v_mov_b32_e32 v0, s0 7734; GFX7-NEXT: v_mov_b32_e32 v1, s1 7735; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7736; GFX7-NEXT: flat_store_dword v[0:1], v2 7737; GFX7-NEXT: s_endpgm 7738; 7739; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 7740; GFX10-WGP: ; %bb.0: ; %entry 7741; GFX10-WGP-NEXT: s_clause 0x1 7742; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7743; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7744; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7745; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7746; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7747; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7748; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7749; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7750; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7751; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7752; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7753; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7754; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7755; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7756; GFX10-WGP-NEXT: s_endpgm 7757; 7758; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 7759; GFX10-CU: ; %bb.0: ; %entry 7760; GFX10-CU-NEXT: s_clause 0x1 7761; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7762; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7763; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7764; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7765; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7766; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7767; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7768; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7769; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7770; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7771; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7772; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7773; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7774; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7775; GFX10-CU-NEXT: s_endpgm 7776; 7777; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 7778; SKIP-CACHE-INV: ; %bb.0: ; %entry 7779; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7780; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7781; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7782; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7783; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7784; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7785; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7786; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7787; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7788; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7789; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7790; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7791; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7792; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7793; SKIP-CACHE-INV-NEXT: s_endpgm 7794; 7795; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 7796; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7797; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7798; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7799; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7800; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7801; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7802; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] offset:16 glc 7803; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7804; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7805; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7806; 7807; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 7808; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7809; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7810; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7811; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7812; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7813; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7814; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7815; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7816; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7817; GFX90A-TGSPLIT-NEXT: s_endpgm 7818 i32* %out, i32 %in, i32 %old) { 7819entry: 7820 %gep = getelementptr i32, i32* %out, i32 4 7821 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst 7822 %val0 = extractvalue { i32, i1 } %val, 0 7823 store i32 %val0, i32* %out, align 4 7824 ret void 7825} 7826 7827define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( 7828; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 7829; GFX7: ; %bb.0: ; %entry 7830; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7831; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7832; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7833; GFX7-NEXT: s_add_u32 s4, s0, 16 7834; GFX7-NEXT: s_addc_u32 s5, s1, 0 7835; GFX7-NEXT: v_mov_b32_e32 v0, s4 7836; GFX7-NEXT: v_mov_b32_e32 v2, s2 7837; GFX7-NEXT: v_mov_b32_e32 v1, s5 7838; GFX7-NEXT: v_mov_b32_e32 v3, s3 7839; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7840; GFX7-NEXT: v_mov_b32_e32 v0, s0 7841; GFX7-NEXT: v_mov_b32_e32 v1, s1 7842; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7843; GFX7-NEXT: flat_store_dword v[0:1], v2 7844; GFX7-NEXT: s_endpgm 7845; 7846; 
GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 7847; GFX10-WGP: ; %bb.0: ; %entry 7848; GFX10-WGP-NEXT: s_clause 0x1 7849; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7850; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7851; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7852; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7853; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7854; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7855; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7856; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7857; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7858; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7859; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7860; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7861; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7862; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7863; GFX10-WGP-NEXT: s_endpgm 7864; 7865; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 7866; GFX10-CU: ; %bb.0: ; %entry 7867; GFX10-CU-NEXT: s_clause 0x1 7868; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7869; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7870; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7871; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7872; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7873; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7874; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7875; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7876; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7877; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7878; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7879; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7880; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7881; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7882; GFX10-CU-NEXT: s_endpgm 7883; 7884; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 7885; SKIP-CACHE-INV: ; %bb.0: ; %entry 7886; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7887; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7888; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7889; 
SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7890; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7891; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7892; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 7893; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 7894; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 7895; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7896; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7897; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7898; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7899; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7900; SKIP-CACHE-INV-NEXT: s_endpgm 7901; 7902; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 7903; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7904; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7905; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7906; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7907; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7908; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7909; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7910; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7911; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7912; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7913; 7914; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 7915; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7916; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7917; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7918; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7919; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 7920; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 7921; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7922; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7923; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7924; GFX90A-TGSPLIT-NEXT: s_endpgm 7925 i32* %out, i32 
%in, i32 %old) { 7926entry: 7927 %gep = getelementptr i32, i32* %out, i32 4 7928 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst 7929 %val0 = extractvalue { i32, i1 } %val, 0 7930 store i32 %val0, i32* %out, align 4 7931 ret void 7932} 7933 7934define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( 7935; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 7936; GFX7: ; %bb.0: ; %entry 7937; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7938; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 7939; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7940; GFX7-NEXT: s_add_u32 s4, s0, 16 7941; GFX7-NEXT: s_addc_u32 s5, s1, 0 7942; GFX7-NEXT: v_mov_b32_e32 v0, s4 7943; GFX7-NEXT: v_mov_b32_e32 v2, s2 7944; GFX7-NEXT: v_mov_b32_e32 v1, s5 7945; GFX7-NEXT: v_mov_b32_e32 v3, s3 7946; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7947; GFX7-NEXT: v_mov_b32_e32 v0, s0 7948; GFX7-NEXT: v_mov_b32_e32 v1, s1 7949; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7950; GFX7-NEXT: flat_store_dword v[0:1], v2 7951; GFX7-NEXT: s_endpgm 7952; 7953; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 7954; GFX10-WGP: ; %bb.0: ; %entry 7955; GFX10-WGP-NEXT: s_clause 0x1 7956; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7957; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7958; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7959; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 7960; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 7961; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7962; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 7963; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7964; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 7965; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7966; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 7967; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 7968; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7969; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7970; GFX10-WGP-NEXT: s_endpgm 7971; 7972; GFX10-CU-LABEL: 
flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 7973; GFX10-CU: ; %bb.0: ; %entry 7974; GFX10-CU-NEXT: s_clause 0x1 7975; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7976; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 7977; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7978; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 7979; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 7980; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7981; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 7982; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7983; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 7984; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7985; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 7986; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 7987; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7988; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7989; GFX10-CU-NEXT: s_endpgm 7990; 7991; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 7992; SKIP-CACHE-INV: ; %bb.0: ; %entry 7993; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 7994; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7995; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7996; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 7997; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 7998; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7999; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 8000; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 8001; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 8002; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8003; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8004; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8005; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8006; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8007; SKIP-CACHE-INV-NEXT: s_endpgm 8008; 8009; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 8010; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8011; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8012; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 8013; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Test case: value-returning cmpxchg with syncscope "singlethread-one-as",
; acq_rel success ordering and seq_cst failure ordering.
; The exchange targets element 4 of %out, and the value read from memory is
; then stored to %out itself. In the expected output this shows up as a
; flat_atomic_cmpswap with the glc bit set, a wait on its result, and a
; flat_store_dword; the +16 byte offset is either added with scalar
; instructions or folded into the instruction's offset field.
; The assertion lines inside the definition are produced by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Index 4 in units of i32, i.e. the 16-byte displacement seen in the
  ; expected output.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
  ; Field 0 of the cmpxchg result pair is the value that was read from memory;
  ; the i1 success flag (field 1) is unused.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}

; Test case: value-returning cmpxchg with syncscope "singlethread-one-as" and
; seq_cst ordering for both the success and the failure case.
; The exchange targets element 4 of %out, and the value read from memory is
; then stored to %out itself. In the expected output this shows up as a
; flat_atomic_cmpswap with the glc bit set, a wait on its result, and a
; flat_store_dword; the +16 byte offset is either added with scalar
; instructions or folded into the instruction's offset field.
; The assertion lines inside the definition are produced by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  ; Index 4 in units of i32, i.e. the 16-byte displacement seen in the
  ; expected output.
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
  ; Field 0 of the cmpxchg result pair is the value that was read from memory;
  ; the i1 success flag (field 1) is unused.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
