; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s

; Expected code for atomic loads, stores and atomicrmw xchg on generic (flat)
; pointers with syncscope("wavefront"), one kernel per memory ordering.
;
; Six configurations are checked: gfx700 (amdhsa), gfx1010 in its default mode
; and with +cumode, gfx700 on amdpal with -amdgcn-skip-cache-invalidations, and
; gfx90a without and with +tgsplit.
;
; In every expected sequence the atomic operation appears as one plain
; flat_load_dword / flat_store_dword / flat_atomic_swap. The only other
; instructions are the kernel-argument loads, register moves, s_waitcnt and
; s_endpgm.
;
; The check lines are generated. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Atomic load, unordered; the loaded value is stored to %out.
define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX7-LABEL: flat_wavefront_unordered_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dword v0, v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[2:3], v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_unordered_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
  store i32 %val, i32* %out
  ret void
}

; Atomic load, monotonic; the loaded value is stored to %out.
define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX7-LABEL: flat_wavefront_monotonic_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dword v0, v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[2:3], v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
  store i32 %val, i32* %out
  ret void
}

; Atomic load, acquire; the loaded value is stored to %out.
define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX7-LABEL: flat_wavefront_acquire_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dword v0, v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[2:3], v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_acquire_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
  store i32 %val, i32* %out
  ret void
}

; Atomic load, seq_cst; the loaded value is stored to %out.
define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX7-LABEL: flat_wavefront_seq_cst_load:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_load_dword v0, v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[2:3], v0
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_load:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
  store i32 %val, i32* %out
  ret void
}

; Atomic store of the kernel argument %in, unordered.
define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX7-LABEL: flat_wavefront_unordered_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_unordered_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
  ret void
}

; Atomic store of the kernel argument %in, monotonic.
define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX7-LABEL: flat_wavefront_monotonic_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
  ret void
}

; Atomic store of the kernel argument %in, release.
define amdgpu_kernel void @flat_wavefront_release_store(
; GFX7-LABEL: flat_wavefront_release_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_release_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_release_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_release_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
  ret void
}

; Atomic store of the kernel argument %in, seq_cst.
define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX7-LABEL: flat_wavefront_seq_cst_store:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_store:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32 %in, i32* %out) {
entry:
  store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4
  ret void
}

; Volatile atomicrmw xchg, monotonic; the returned value is unused.
define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic
  ret void
}

; Volatile atomicrmw xchg, acquire; the returned value is unused.
define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
;
    i32* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
  ret void
}

define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_release_atomicrmw:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_wavefront_release_atomicrmw:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:
s_load_dword s2, s[4:5], 0x8 862; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 863; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 864; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 865; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 866; GFX90A-TGSPLIT-NEXT: s_endpgm 867; 868; 869 i32* %out, i32 %in) { 870entry: 871 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release 872 ret void 873} 874 875define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( 876; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw: 877; GFX7: ; %bb.0: ; %entry 878; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 879; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 880; GFX7-NEXT: s_waitcnt lgkmcnt(0) 881; GFX7-NEXT: v_mov_b32_e32 v0, s0 882; GFX7-NEXT: v_mov_b32_e32 v1, s1 883; GFX7-NEXT: v_mov_b32_e32 v2, s2 884; GFX7-NEXT: flat_atomic_swap v[0:1], v2 885; GFX7-NEXT: s_endpgm 886; 887; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: 888; GFX10-WGP: ; %bb.0: ; %entry 889; GFX10-WGP-NEXT: s_clause 0x1 890; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 891; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 892; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 893; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 894; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 895; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 896; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 897; GFX10-WGP-NEXT: s_endpgm 898; 899; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: 900; GFX10-CU: ; %bb.0: ; %entry 901; GFX10-CU-NEXT: s_clause 0x1 902; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 903; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 904; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 905; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 906; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 907; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 908; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 909; GFX10-CU-NEXT: s_endpgm 910; 911; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_atomicrmw: 912; SKIP-CACHE-INV: ; %bb.0: ; %entry 913; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x9 914; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 915; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 916; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 917; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 918; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 919; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 920; SKIP-CACHE-INV-NEXT: s_endpgm 921; 922; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: 923; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 924; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 925; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 926; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 927; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 928; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 929; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 930; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 931; 932; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: 933; GFX90A-TGSPLIT: ; %bb.0: ; %entry 934; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 935; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 936; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 937; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 938; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 939; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 940; GFX90A-TGSPLIT-NEXT: s_endpgm 941; 942; 943 i32* %out, i32 %in) { 944entry: 945 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel 946 ret void 947} 948 949define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( 950; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw: 951; GFX7: ; %bb.0: ; %entry 952; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 953; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 954; GFX7-NEXT: s_waitcnt lgkmcnt(0) 955; GFX7-NEXT: v_mov_b32_e32 v0, s0 956; GFX7-NEXT: v_mov_b32_e32 v1, s1 957; GFX7-NEXT: v_mov_b32_e32 v2, s2 958; GFX7-NEXT: flat_atomic_swap v[0:1], v2 959; GFX7-NEXT: s_endpgm 960; 961; GFX10-WGP-LABEL: 
flat_wavefront_seq_cst_atomicrmw: 962; GFX10-WGP: ; %bb.0: ; %entry 963; GFX10-WGP-NEXT: s_clause 0x1 964; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 965; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 966; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 967; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 968; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 969; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 970; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 971; GFX10-WGP-NEXT: s_endpgm 972; 973; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: 974; GFX10-CU: ; %bb.0: ; %entry 975; GFX10-CU-NEXT: s_clause 0x1 976; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 977; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 978; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 979; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 980; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 981; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 982; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 983; GFX10-CU-NEXT: s_endpgm 984; 985; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_atomicrmw: 986; SKIP-CACHE-INV: ; %bb.0: ; %entry 987; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 988; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 989; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 990; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 991; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 992; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 993; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 994; SKIP-CACHE-INV-NEXT: s_endpgm 995; 996; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: 997; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 998; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 999; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1000; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1001; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1002; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1003; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1004; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1005; 1006; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_seq_cst_atomicrmw: 1007; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1008; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1009; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1010; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1012; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1013; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1014; GFX90A-TGSPLIT-NEXT: s_endpgm 1015; 1016; 1017 i32* %out, i32 %in) { 1018entry: 1019 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst 1020 ret void 1021} 1022 1023define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( 1024; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw: 1025; GFX7: ; %bb.0: ; %entry 1026; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1027; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1028; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1029; GFX7-NEXT: v_mov_b32_e32 v0, s0 1030; GFX7-NEXT: v_mov_b32_e32 v1, s1 1031; GFX7-NEXT: v_mov_b32_e32 v2, s2 1032; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1033; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1034; GFX7-NEXT: flat_store_dword v[0:1], v2 1035; GFX7-NEXT: s_endpgm 1036; 1037; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: 1038; GFX10-WGP: ; %bb.0: ; %entry 1039; GFX10-WGP-NEXT: s_clause 0x1 1040; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1041; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1042; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1043; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1044; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1045; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1046; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1047; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1048; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1049; GFX10-WGP-NEXT: s_endpgm 1050; 1051; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: 1052; GFX10-CU: ; %bb.0: ; %entry 1053; GFX10-CU-NEXT: s_clause 0x1 1054; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1055; 
GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1056; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1057; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1058; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1059; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1060; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1061; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1062; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1063; GFX10-CU-NEXT: s_endpgm 1064; 1065; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_ret_atomicrmw: 1066; SKIP-CACHE-INV: ; %bb.0: ; %entry 1067; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1068; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1069; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1070; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1071; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1072; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1073; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1074; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1075; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1076; SKIP-CACHE-INV-NEXT: s_endpgm 1077; 1078; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: 1079; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1080; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1081; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1082; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1084; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1085; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1086; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1087; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1088; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1089; 1090; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: 1091; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1092; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1093; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1094; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1095; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1096; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1097; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1098; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1099; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1100; GFX90A-TGSPLIT-NEXT: s_endpgm 1101; 1102; 1103 i32* %out, i32 %in) { 1104entry: 1105 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire 1106 store i32 %val, i32* %out, align 4 1107 ret void 1108} 1109 1110define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( 1111; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: 1112; GFX7: ; %bb.0: ; %entry 1113; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1114; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1115; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX7-NEXT: v_mov_b32_e32 v0, s0 1117; GFX7-NEXT: v_mov_b32_e32 v1, s1 1118; GFX7-NEXT: v_mov_b32_e32 v2, s2 1119; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1120; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1121; GFX7-NEXT: flat_store_dword v[0:1], v2 1122; GFX7-NEXT: s_endpgm 1123; 1124; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: 1125; GFX10-WGP: ; %bb.0: ; %entry 1126; GFX10-WGP-NEXT: s_clause 0x1 1127; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1128; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1129; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1130; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1131; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1132; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1133; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1134; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1135; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1136; GFX10-WGP-NEXT: s_endpgm 1137; 1138; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: 1139; GFX10-CU: ; %bb.0: ; %entry 1140; GFX10-CU-NEXT: s_clause 0x1 1141; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1142; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1143; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1144; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1145; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1146; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1147; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1148; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1149; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1150; GFX10-CU-NEXT: s_endpgm 1151; 1152; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: 1153; SKIP-CACHE-INV: ; %bb.0: ; %entry 1154; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1155; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1156; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1157; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1158; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1159; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1160; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1161; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1162; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1163; SKIP-CACHE-INV-NEXT: s_endpgm 1164; 1165; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: 1166; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1167; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1168; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1169; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1170; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1171; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1172; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1173; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1174; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1175; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1176; 1177; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: 1178; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1179; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1180; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1181; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1182; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1183; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1184; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, 
v[0:1], v2 glc 1185; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1186; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1187; GFX90A-TGSPLIT-NEXT: s_endpgm 1188; 1189; 1190 i32* %out, i32 %in) { 1191entry: 1192 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel 1193 store i32 %val, i32* %out, align 4 1194 ret void 1195} 1196 1197define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( 1198; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: 1199; GFX7: ; %bb.0: ; %entry 1200; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1201; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1202; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX7-NEXT: v_mov_b32_e32 v0, s0 1204; GFX7-NEXT: v_mov_b32_e32 v1, s1 1205; GFX7-NEXT: v_mov_b32_e32 v2, s2 1206; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1207; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1208; GFX7-NEXT: flat_store_dword v[0:1], v2 1209; GFX7-NEXT: s_endpgm 1210; 1211; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: 1212; GFX10-WGP: ; %bb.0: ; %entry 1213; GFX10-WGP-NEXT: s_clause 0x1 1214; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1215; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1216; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1217; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1218; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1219; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1220; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1221; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1222; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1223; GFX10-WGP-NEXT: s_endpgm 1224; 1225; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: 1226; GFX10-CU: ; %bb.0: ; %entry 1227; GFX10-CU-NEXT: s_clause 0x1 1228; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1229; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1230; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1231; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1232; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1233; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1234; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 
glc 1235; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1236; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1237; GFX10-CU-NEXT: s_endpgm 1238; 1239; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: 1240; SKIP-CACHE-INV: ; %bb.0: ; %entry 1241; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1242; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1243; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1244; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1245; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1246; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1247; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1248; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1249; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1250; SKIP-CACHE-INV-NEXT: s_endpgm 1251; 1252; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: 1253; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1254; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1255; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1256; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1257; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1258; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1259; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1260; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1261; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1262; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1263; 1264; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: 1265; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1266; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1267; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1268; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1269; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1270; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1271; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1272; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1273; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1274; 
GFX90A-TGSPLIT-NEXT: s_endpgm 1275; 1276; 1277 i32* %out, i32 %in) { 1278entry: 1279 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst 1280 store i32 %val, i32* %out, align 4 1281 ret void 1282} 1283 1284define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( 1285; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: 1286; GFX7: ; %bb.0: ; %entry 1287; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1288; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1289; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1290; GFX7-NEXT: s_add_u32 s0, s0, 16 1291; GFX7-NEXT: s_addc_u32 s1, s1, 0 1292; GFX7-NEXT: v_mov_b32_e32 v0, s0 1293; GFX7-NEXT: v_mov_b32_e32 v2, s2 1294; GFX7-NEXT: v_mov_b32_e32 v1, s1 1295; GFX7-NEXT: v_mov_b32_e32 v3, s3 1296; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1297; GFX7-NEXT: s_endpgm 1298; 1299; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: 1300; GFX10-WGP: ; %bb.0: ; %entry 1301; GFX10-WGP-NEXT: s_clause 0x1 1302; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1303; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1304; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1305; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1306; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1307; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1308; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1309; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1310; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1311; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1312; GFX10-WGP-NEXT: s_endpgm 1313; 1314; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: 1315; GFX10-CU: ; %bb.0: ; %entry 1316; GFX10-CU-NEXT: s_clause 0x1 1317; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1318; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1319; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1320; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1321; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1322; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1323; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1324; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 
1325; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1326; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1327; GFX10-CU-NEXT: s_endpgm 1328; 1329; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: 1330; SKIP-CACHE-INV: ; %bb.0: ; %entry 1331; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1332; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1333; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1334; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1335; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1336; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1337; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1338; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1339; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1340; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1341; SKIP-CACHE-INV-NEXT: s_endpgm 1342; 1343; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: 1344; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1345; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1346; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1347; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1348; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1349; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1350; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1351; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1352; 1353; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: 1354; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1355; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1356; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1357; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1358; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1359; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1360; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1361; GFX90A-TGSPLIT-NEXT: s_endpgm 1362; 1363; 1364 i32* %out, i32 %in, i32 %old) { 1365entry: 1366 
%gep = getelementptr i32, i32* %out, i32 4 1367 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic 1368 ret void 1369} 1370 1371define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( 1372; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: 1373; GFX7: ; %bb.0: ; %entry 1374; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1375; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1376; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1377; GFX7-NEXT: s_add_u32 s0, s0, 16 1378; GFX7-NEXT: s_addc_u32 s1, s1, 0 1379; GFX7-NEXT: v_mov_b32_e32 v0, s0 1380; GFX7-NEXT: v_mov_b32_e32 v2, s2 1381; GFX7-NEXT: v_mov_b32_e32 v1, s1 1382; GFX7-NEXT: v_mov_b32_e32 v3, s3 1383; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1384; GFX7-NEXT: s_endpgm 1385; 1386; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: 1387; GFX10-WGP: ; %bb.0: ; %entry 1388; GFX10-WGP-NEXT: s_clause 0x1 1389; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1390; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1391; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1392; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1393; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1394; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1395; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1396; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1397; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1398; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1399; GFX10-WGP-NEXT: s_endpgm 1400; 1401; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: 1402; GFX10-CU: ; %bb.0: ; %entry 1403; GFX10-CU-NEXT: s_clause 0x1 1404; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1405; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1406; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1407; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1408; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1409; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1410; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1411; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1412; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1413; GFX10-CU-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 1414; GFX10-CU-NEXT: s_endpgm 1415; 1416; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: 1417; SKIP-CACHE-INV: ; %bb.0: ; %entry 1418; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1419; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1420; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1421; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1422; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1423; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1424; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1425; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1426; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1427; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1428; SKIP-CACHE-INV-NEXT: s_endpgm 1429; 1430; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: 1431; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1432; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1433; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1434; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1435; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1436; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1437; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1438; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1439; 1440; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: 1441; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1442; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1443; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1444; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1445; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1446; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1447; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1448; GFX90A-TGSPLIT-NEXT: s_endpgm 1449; 1450; 1451 i32* %out, i32 %in, i32 %old) { 1452entry: 1453 %gep = getelementptr i32, i32* %out, i32 4 1454 %val = cmpxchg volatile 
i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic 1455 ret void 1456} 1457 1458define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( 1459; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: 1460; GFX7: ; %bb.0: ; %entry 1461; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1462; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1463; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1464; GFX7-NEXT: s_add_u32 s0, s0, 16 1465; GFX7-NEXT: s_addc_u32 s1, s1, 0 1466; GFX7-NEXT: v_mov_b32_e32 v0, s0 1467; GFX7-NEXT: v_mov_b32_e32 v2, s2 1468; GFX7-NEXT: v_mov_b32_e32 v1, s1 1469; GFX7-NEXT: v_mov_b32_e32 v3, s3 1470; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1471; GFX7-NEXT: s_endpgm 1472; 1473; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: 1474; GFX10-WGP: ; %bb.0: ; %entry 1475; GFX10-WGP-NEXT: s_clause 0x1 1476; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1477; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1478; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1479; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1480; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1481; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1482; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1483; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1484; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1485; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1486; GFX10-WGP-NEXT: s_endpgm 1487; 1488; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: 1489; GFX10-CU: ; %bb.0: ; %entry 1490; GFX10-CU-NEXT: s_clause 0x1 1491; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1492; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1493; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1494; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1495; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1496; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1497; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1498; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1499; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1500; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1501; GFX10-CU-NEXT: s_endpgm 1502; 1503; 
SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_cmpxchg: 1504; SKIP-CACHE-INV: ; %bb.0: ; %entry 1505; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1506; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1507; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1508; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1509; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1510; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1511; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1512; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1513; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1514; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1515; SKIP-CACHE-INV-NEXT: s_endpgm 1516; 1517; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: 1518; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1519; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1520; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1521; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1522; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1523; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1524; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1525; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1526; 1527; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: 1528; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1529; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1530; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1531; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1532; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1533; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1534; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1535; GFX90A-TGSPLIT-NEXT: s_endpgm 1536; 1537; 1538 i32* %out, i32 %in, i32 %old) { 1539entry: 1540 %gep = getelementptr i32, i32* %out, i32 4 1541 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic 1542 
ret void 1543} 1544 1545define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( 1546; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: 1547; GFX7: ; %bb.0: ; %entry 1548; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1549; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1550; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1551; GFX7-NEXT: s_add_u32 s0, s0, 16 1552; GFX7-NEXT: s_addc_u32 s1, s1, 0 1553; GFX7-NEXT: v_mov_b32_e32 v0, s0 1554; GFX7-NEXT: v_mov_b32_e32 v2, s2 1555; GFX7-NEXT: v_mov_b32_e32 v1, s1 1556; GFX7-NEXT: v_mov_b32_e32 v3, s3 1557; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1558; GFX7-NEXT: s_endpgm 1559; 1560; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: 1561; GFX10-WGP: ; %bb.0: ; %entry 1562; GFX10-WGP-NEXT: s_clause 0x1 1563; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1564; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1565; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1566; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1567; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1568; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1569; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1570; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1571; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1572; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1573; GFX10-WGP-NEXT: s_endpgm 1574; 1575; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: 1576; GFX10-CU: ; %bb.0: ; %entry 1577; GFX10-CU-NEXT: s_clause 0x1 1578; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1579; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1580; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1581; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1582; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1583; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1584; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1585; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1586; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1587; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1588; GFX10-CU-NEXT: s_endpgm 1589; 1590; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: 1591; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 1592; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1593; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1594; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1595; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1596; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1597; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1598; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1599; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1600; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1601; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1602; SKIP-CACHE-INV-NEXT: s_endpgm 1603; 1604; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: 1605; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1606; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1607; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1608; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1609; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1610; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1611; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1612; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1613; 1614; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: 1615; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1616; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1617; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1618; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1619; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1620; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1621; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1622; GFX90A-TGSPLIT-NEXT: s_endpgm 1623; 1624; 1625 i32* %out, i32 %in, i32 %old) { 1626entry: 1627 %gep = getelementptr i32, i32* %out, i32 4 1628 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic 1629 ret void 1630} 1631 1632define amdgpu_kernel void 
; Covers kernel @flat_wavefront_seq_cst_monotonic_cmpxchg (its "define" keyword sits at
; the end of the preceding line of this wrapped listing).
; IR under test: cmpxchg volatile on %out advanced by 4 i32 elements (getelementptr),
; syncscope("wavefront"), success ordering seq_cst, failure ordering monotonic; the
; cmpxchg result is unused.
; Every run configuration expects flat_atomic_cmpswap immediately followed by s_endpgm.
; GFX7, both GFX10 modes and SKIP-CACHE-INV add 16 to the pointer with
; s_add_u32/s_addc_u32; both GFX90A modes use offset:16 on the atomic instead.
; Check lines are autogenerated by utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
@flat_wavefront_seq_cst_monotonic_cmpxchg( 1633; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: 1634; GFX7: ; %bb.0: ; %entry 1635; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1636; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1637; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1638; GFX7-NEXT: s_add_u32 s0, s0, 16 1639; GFX7-NEXT: s_addc_u32 s1, s1, 0 1640; GFX7-NEXT: v_mov_b32_e32 v0, s0 1641; GFX7-NEXT: v_mov_b32_e32 v2, s2 1642; GFX7-NEXT: v_mov_b32_e32 v1, s1 1643; GFX7-NEXT: v_mov_b32_e32 v3, s3 1644; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1645; GFX7-NEXT: s_endpgm 1646; 1647; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: 1648; GFX10-WGP: ; %bb.0: ; %entry 1649; GFX10-WGP-NEXT: s_clause 0x1 1650; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1651; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1652; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1653; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1654; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1655; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1656; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1657; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1658; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1659; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1660; GFX10-WGP-NEXT: s_endpgm 1661; 1662; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: 1663; GFX10-CU: ; %bb.0: ; %entry 1664; GFX10-CU-NEXT: s_clause 0x1 1665; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1666; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1667; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1668; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1669; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1670; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1671; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1672; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1673; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1674; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1675; GFX10-CU-NEXT: s_endpgm 1676; 1677; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: 1678; SKIP-CACHE-INV: ; %bb.0: ; %entry 1679; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x9 1680; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1681; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1682; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1683; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1684; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1685; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1686; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1687; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1688; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1689; SKIP-CACHE-INV-NEXT: s_endpgm 1690; 1691; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: 1692; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1693; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1694; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1695; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1696; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1697; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1698; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1699; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1700; 1701; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: 1702; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1703; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1704; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1705; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1706; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1707; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1708; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1709; GFX90A-TGSPLIT-NEXT: s_endpgm 1710; 1711; 1712 i32* %out, i32 %in, i32 %old) { 1713entry: 1714 %gep = getelementptr i32, i32* %out, i32 4 1715 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic 1716 ret void 1717} 1718 1719define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( 1720; GFX7-LABEL: 
; Covers kernel @flat_wavefront_acquire_acquire_cmpxchg (its "define" line sits at the
; end of the preceding line of this wrapped listing).
; IR under test: cmpxchg volatile on %out advanced by 4 i32 elements (getelementptr),
; syncscope("wavefront"), success ordering acquire, failure ordering acquire; the
; cmpxchg result is unused.
; Every run configuration expects flat_atomic_cmpswap immediately followed by s_endpgm.
; GFX7, both GFX10 modes and SKIP-CACHE-INV add 16 to the pointer with
; s_add_u32/s_addc_u32; both GFX90A modes use offset:16 on the atomic instead.
; Check lines are autogenerated by utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
flat_wavefront_acquire_acquire_cmpxchg: 1721; GFX7: ; %bb.0: ; %entry 1722; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1723; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1724; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1725; GFX7-NEXT: s_add_u32 s0, s0, 16 1726; GFX7-NEXT: s_addc_u32 s1, s1, 0 1727; GFX7-NEXT: v_mov_b32_e32 v0, s0 1728; GFX7-NEXT: v_mov_b32_e32 v2, s2 1729; GFX7-NEXT: v_mov_b32_e32 v1, s1 1730; GFX7-NEXT: v_mov_b32_e32 v3, s3 1731; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1732; GFX7-NEXT: s_endpgm 1733; 1734; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: 1735; GFX10-WGP: ; %bb.0: ; %entry 1736; GFX10-WGP-NEXT: s_clause 0x1 1737; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1738; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1739; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1741; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1742; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1743; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1744; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1745; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1746; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1747; GFX10-WGP-NEXT: s_endpgm 1748; 1749; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: 1750; GFX10-CU: ; %bb.0: ; %entry 1751; GFX10-CU-NEXT: s_clause 0x1 1752; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1753; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1754; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1756; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1757; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1758; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1759; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1760; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1761; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1762; GFX10-CU-NEXT: s_endpgm 1763; 1764; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg: 1765; SKIP-CACHE-INV: ; %bb.0: ; %entry 1766; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1767; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xb 1768; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1769; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1770; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1771; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1772; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1773; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1774; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1775; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1776; SKIP-CACHE-INV-NEXT: s_endpgm 1777; 1778; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: 1779; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1780; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1781; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1782; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1783; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1784; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1785; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1786; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1787; 1788; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: 1789; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1790; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1791; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1792; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1793; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1794; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1795; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1796; GFX90A-TGSPLIT-NEXT: s_endpgm 1797; 1798; 1799 i32* %out, i32 %in, i32 %old) { 1800entry: 1801 %gep = getelementptr i32, i32* %out, i32 4 1802 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire 1803 ret void 1804} 1805 1806define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( 1807; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: 1808; GFX7: ; %bb.0: ; %entry 1809; GFX7-NEXT: 
; Covers kernel @flat_wavefront_release_acquire_cmpxchg (its "define" line and first
; GFX7 check lines sit at the end of the preceding line of this wrapped listing).
; IR under test: cmpxchg volatile on %out advanced by 4 i32 elements (getelementptr),
; syncscope("wavefront"), success ordering release, failure ordering acquire; the
; cmpxchg result is unused.
; Every run configuration expects flat_atomic_cmpswap immediately followed by s_endpgm.
; GFX7, both GFX10 modes and SKIP-CACHE-INV add 16 to the pointer with
; s_add_u32/s_addc_u32; both GFX90A modes use offset:16 on the atomic instead.
; Check lines are autogenerated by utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
s_load_dwordx2 s[0:1], s[4:5], 0x0 1810; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1811; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1812; GFX7-NEXT: s_add_u32 s0, s0, 16 1813; GFX7-NEXT: s_addc_u32 s1, s1, 0 1814; GFX7-NEXT: v_mov_b32_e32 v0, s0 1815; GFX7-NEXT: v_mov_b32_e32 v2, s2 1816; GFX7-NEXT: v_mov_b32_e32 v1, s1 1817; GFX7-NEXT: v_mov_b32_e32 v3, s3 1818; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1819; GFX7-NEXT: s_endpgm 1820; 1821; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: 1822; GFX10-WGP: ; %bb.0: ; %entry 1823; GFX10-WGP-NEXT: s_clause 0x1 1824; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1825; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1826; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1827; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1828; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1829; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1830; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1831; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1832; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1833; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1834; GFX10-WGP-NEXT: s_endpgm 1835; 1836; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: 1837; GFX10-CU: ; %bb.0: ; %entry 1838; GFX10-CU-NEXT: s_clause 0x1 1839; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1840; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1841; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1842; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1843; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1844; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1845; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1846; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1847; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1848; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1849; GFX10-CU-NEXT: s_endpgm 1850; 1851; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg: 1852; SKIP-CACHE-INV: ; %bb.0: ; %entry 1853; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1854; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1855; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1856; 
SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1857; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1858; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1859; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1860; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1861; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1862; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1863; SKIP-CACHE-INV-NEXT: s_endpgm 1864; 1865; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: 1866; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1867; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1868; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1869; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1870; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1871; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1872; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1873; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1874; 1875; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: 1876; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1877; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1878; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1879; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1880; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1881; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1882; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1883; GFX90A-TGSPLIT-NEXT: s_endpgm 1884; 1885; 1886 i32* %out, i32 %in, i32 %old) { 1887entry: 1888 %gep = getelementptr i32, i32* %out, i32 4 1889 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire 1890 ret void 1891} 1892 1893define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( 1894; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: 1895; GFX7: ; %bb.0: ; %entry 1896; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1897; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1898; 
; Covers kernel @flat_wavefront_acq_rel_acquire_cmpxchg (its "define" line and first
; GFX7 check lines sit at the end of the preceding line of this wrapped listing).
; IR under test: cmpxchg volatile on %out advanced by 4 i32 elements (getelementptr),
; syncscope("wavefront"), success ordering acq_rel, failure ordering acquire; the
; cmpxchg result is unused.
; Every run configuration expects flat_atomic_cmpswap immediately followed by s_endpgm.
; GFX7, both GFX10 modes and SKIP-CACHE-INV add 16 to the pointer with
; s_add_u32/s_addc_u32; both GFX90A modes use offset:16 on the atomic instead.
; Check lines are autogenerated by utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
GFX7-NEXT: s_waitcnt lgkmcnt(0) 1899; GFX7-NEXT: s_add_u32 s0, s0, 16 1900; GFX7-NEXT: s_addc_u32 s1, s1, 0 1901; GFX7-NEXT: v_mov_b32_e32 v0, s0 1902; GFX7-NEXT: v_mov_b32_e32 v2, s2 1903; GFX7-NEXT: v_mov_b32_e32 v1, s1 1904; GFX7-NEXT: v_mov_b32_e32 v3, s3 1905; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1906; GFX7-NEXT: s_endpgm 1907; 1908; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: 1909; GFX10-WGP: ; %bb.0: ; %entry 1910; GFX10-WGP-NEXT: s_clause 0x1 1911; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1912; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1913; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1914; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1915; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1916; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1917; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1918; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1919; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1920; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1921; GFX10-WGP-NEXT: s_endpgm 1922; 1923; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: 1924; GFX10-CU: ; %bb.0: ; %entry 1925; GFX10-CU-NEXT: s_clause 0x1 1926; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1927; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1928; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1929; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1930; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1931; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1932; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1933; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1934; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1935; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1936; GFX10-CU-NEXT: s_endpgm 1937; 1938; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: 1939; SKIP-CACHE-INV: ; %bb.0: ; %entry 1940; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1941; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1942; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1943; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1944; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1945; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1946; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1947; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1948; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1949; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1950; SKIP-CACHE-INV-NEXT: s_endpgm 1951; 1952; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: 1953; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1954; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1955; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1956; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1957; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1958; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1959; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1960; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1961; 1962; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: 1963; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1964; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1965; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1966; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1967; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 1968; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 1969; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 1970; GFX90A-TGSPLIT-NEXT: s_endpgm 1971; 1972; 1973 i32* %out, i32 %in, i32 %old) { 1974entry: 1975 %gep = getelementptr i32, i32* %out, i32 4 1976 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire 1977 ret void 1978} 1979 1980define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( 1981; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: 1982; GFX7: ; %bb.0: ; %entry 1983; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1984; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1985; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1986; GFX7-NEXT: s_add_u32 s0, s0, 16 1987; GFX7-NEXT: s_addc_u32 
; Covers kernel @flat_wavefront_seq_cst_acquire_cmpxchg (its "define" line and first
; GFX7 check lines sit at the end of the preceding line of this wrapped listing).
; IR under test: cmpxchg volatile on %out advanced by 4 i32 elements (getelementptr),
; syncscope("wavefront"), success ordering seq_cst, failure ordering acquire; the
; cmpxchg result is unused.
; Every run configuration expects flat_atomic_cmpswap immediately followed by s_endpgm.
; GFX7, both GFX10 modes and SKIP-CACHE-INV add 16 to the pointer with
; s_add_u32/s_addc_u32; both GFX90A modes use offset:16 on the atomic instead.
; Check lines are autogenerated by utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
s1, s1, 0 1988; GFX7-NEXT: v_mov_b32_e32 v0, s0 1989; GFX7-NEXT: v_mov_b32_e32 v2, s2 1990; GFX7-NEXT: v_mov_b32_e32 v1, s1 1991; GFX7-NEXT: v_mov_b32_e32 v3, s3 1992; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1993; GFX7-NEXT: s_endpgm 1994; 1995; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: 1996; GFX10-WGP: ; %bb.0: ; %entry 1997; GFX10-WGP-NEXT: s_clause 0x1 1998; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1999; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2000; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2001; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2002; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2003; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2004; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2005; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2006; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2007; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2008; GFX10-WGP-NEXT: s_endpgm 2009; 2010; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: 2011; GFX10-CU: ; %bb.0: ; %entry 2012; GFX10-CU-NEXT: s_clause 0x1 2013; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2014; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2015; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2016; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2017; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2018; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2019; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2020; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2021; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2022; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2023; GFX10-CU-NEXT: s_endpgm 2024; 2025; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: 2026; SKIP-CACHE-INV: ; %bb.0: ; %entry 2027; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2028; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2029; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2030; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2031; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2032; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2033; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2034; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2035; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2036; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2037; SKIP-CACHE-INV-NEXT: s_endpgm 2038; 2039; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: 2040; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2041; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2042; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2043; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2044; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2045; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2046; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2047; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2048; 2049; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: 2050; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2051; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2052; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2053; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2054; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2055; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2056; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2057; GFX90A-TGSPLIT-NEXT: s_endpgm 2058; 2059; 2060 i32* %out, i32 %in, i32 %old) { 2061entry: 2062 %gep = getelementptr i32, i32* %out, i32 4 2063 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire 2064 ret void 2065} 2066 2067define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( 2068; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: 2069; GFX7: ; %bb.0: ; %entry 2070; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2071; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2072; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2073; GFX7-NEXT: s_add_u32 s0, s0, 16 2074; GFX7-NEXT: s_addc_u32 s1, s1, 0 2075; GFX7-NEXT: v_mov_b32_e32 v0, s0 2076; GFX7-NEXT: v_mov_b32_e32 v2, s2 2077; 
; Covers kernel @flat_wavefront_seq_cst_seq_cst_cmpxchg (its "define" line and first
; GFX7 check lines sit at the end of the preceding line of this wrapped listing).
; IR under test: cmpxchg volatile on %out advanced by 4 i32 elements (getelementptr),
; syncscope("wavefront"), success ordering seq_cst, failure ordering seq_cst; the
; cmpxchg result is unused.
; Every run configuration expects flat_atomic_cmpswap immediately followed by s_endpgm.
; GFX7, both GFX10 modes and SKIP-CACHE-INV add 16 to the pointer with
; s_add_u32/s_addc_u32; both GFX90A modes use offset:16 on the atomic instead.
; Check lines are autogenerated by utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
GFX7-NEXT: v_mov_b32_e32 v1, s1 2078; GFX7-NEXT: v_mov_b32_e32 v3, s3 2079; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2080; GFX7-NEXT: s_endpgm 2081; 2082; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: 2083; GFX10-WGP: ; %bb.0: ; %entry 2084; GFX10-WGP-NEXT: s_clause 0x1 2085; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2086; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2087; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2088; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 2089; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 2090; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2091; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2092; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2093; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2094; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2095; GFX10-WGP-NEXT: s_endpgm 2096; 2097; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: 2098; GFX10-CU: ; %bb.0: ; %entry 2099; GFX10-CU-NEXT: s_clause 0x1 2100; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2101; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2102; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2103; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 2104; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 2105; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2106; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2107; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2108; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2109; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2110; GFX10-CU-NEXT: s_endpgm 2111; 2112; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: 2113; SKIP-CACHE-INV: ; %bb.0: ; %entry 2114; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2115; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2116; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2117; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 2118; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 2119; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2120; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2121; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2122; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2123; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2124; SKIP-CACHE-INV-NEXT: s_endpgm 2125; 2126; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: 2127; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2128; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2129; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2130; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2131; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2132; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2133; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2134; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2135; 2136; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: 2137; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2138; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2139; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2140; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2141; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2142; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2143; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2144; GFX90A-TGSPLIT-NEXT: s_endpgm 2145; 2146; 2147 i32* %out, i32 %in, i32 %old) { 2148entry: 2149 %gep = getelementptr i32, i32* %out, i32 4 2150 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst 2151 ret void 2152} 2153 2154define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( 2155; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: 2156; GFX7: ; %bb.0: ; %entry 2157; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2158; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2159; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2160; GFX7-NEXT: s_add_u32 s4, s0, 16 2161; GFX7-NEXT: s_addc_u32 s5, s1, 0 2162; GFX7-NEXT: v_mov_b32_e32 v0, s4 2163; GFX7-NEXT: v_mov_b32_e32 v2, s2 2164; GFX7-NEXT: v_mov_b32_e32 v1, s5 2165; GFX7-NEXT: v_mov_b32_e32 v3, s3 2166; GFX7-NEXT: 
; Covers kernel @flat_wavefront_acquire_monotonic_ret_cmpxchg (its "define" line and
; first GFX7 check lines sit at the end of the preceding line of this wrapped listing).
; IR under test: cmpxchg volatile on %out advanced by 4 i32 elements (getelementptr),
; syncscope("wavefront"), success ordering acquire, failure ordering monotonic;
; element 0 of the result (extractvalue) is stored to %out.
; Every run configuration expects flat_atomic_cmpswap with glc returning into v2, then
; an s_waitcnt, then flat_store_dword of v2. The wait is "vmcnt(0) lgkmcnt(0)" in all
; runs except GFX90A-TGSPLIT, which expects "vmcnt(0)" only.
; Check lines are autogenerated by utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2167; GFX7-NEXT: v_mov_b32_e32 v0, s0 2168; GFX7-NEXT: v_mov_b32_e32 v1, s1 2169; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2170; GFX7-NEXT: flat_store_dword v[0:1], v2 2171; GFX7-NEXT: s_endpgm 2172; 2173; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: 2174; GFX10-WGP: ; %bb.0: ; %entry 2175; GFX10-WGP-NEXT: s_clause 0x1 2176; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2177; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2178; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2179; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2180; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2181; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2182; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2183; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2184; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2185; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2186; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2187; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2188; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2189; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2190; GFX10-WGP-NEXT: s_endpgm 2191; 2192; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: 2193; GFX10-CU: ; %bb.0: ; %entry 2194; GFX10-CU-NEXT: s_clause 0x1 2195; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2196; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2197; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2198; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2199; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2200; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2201; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2202; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2203; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2204; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2205; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2206; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2207; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2208; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2209; GFX10-CU-NEXT: s_endpgm 2210; 2211; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: 2212; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 2213; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2214; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2215; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2216; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2217; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2218; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2219; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2220; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2221; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2222; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2223; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2224; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2225; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2226; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2227; SKIP-CACHE-INV-NEXT: s_endpgm 2228; 2229; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: 2230; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2231; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2232; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2233; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2234; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2235; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2236; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2237; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2238; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2239; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2240; 2241; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: 2242; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2243; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2244; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2245; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2246; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2247; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2248; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2249; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2250; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2251; GFX90A-TGSPLIT-NEXT: s_endpgm 2252; 2253; 2254 i32* %out, i32 %in, i32 %old) { 2255entry: 2256 %gep = getelementptr i32, i32* %out, i32 4 2257 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic 2258 %val0 = extractvalue { i32, i1 } %val, 0 2259 store i32 %val0, i32* %out, align 4 2260 ret void 2261} 2262 2263define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( 2264; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: 2265; GFX7: ; %bb.0: ; %entry 2266; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2267; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2268; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2269; GFX7-NEXT: s_add_u32 s4, s0, 16 2270; GFX7-NEXT: s_addc_u32 s5, s1, 0 2271; GFX7-NEXT: v_mov_b32_e32 v0, s4 2272; GFX7-NEXT: v_mov_b32_e32 v2, s2 2273; GFX7-NEXT: v_mov_b32_e32 v1, s5 2274; GFX7-NEXT: v_mov_b32_e32 v3, s3 2275; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2276; GFX7-NEXT: v_mov_b32_e32 v0, s0 2277; GFX7-NEXT: v_mov_b32_e32 v1, s1 2278; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2279; GFX7-NEXT: flat_store_dword v[0:1], v2 2280; GFX7-NEXT: s_endpgm 2281; 2282; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: 2283; GFX10-WGP: ; %bb.0: ; %entry 2284; GFX10-WGP-NEXT: s_clause 0x1 2285; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2286; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2287; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2288; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2289; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2290; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2291; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2292; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2293; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2294; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2295; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2296; GFX10-WGP-NEXT: 
; Covers kernel @flat_wavefront_acq_rel_monotonic_ret_cmpxchg (its "define" line, GFX7
; checks and first GFX10-WGP checks sit on the preceding line of this wrapped listing).
; IR under test: cmpxchg volatile on %out advanced by 4 i32 elements (getelementptr),
; syncscope("wavefront"), success ordering acq_rel, failure ordering monotonic;
; element 0 of the result (extractvalue) is stored to %out.
; The run configurations checked on these lines expect flat_atomic_cmpswap with glc
; returning into v2, then an s_waitcnt, then flat_store_dword of v2. The wait is
; "vmcnt(0) lgkmcnt(0)" in all of them except GFX90A-TGSPLIT, which expects
; "vmcnt(0)" only.
; Check lines are autogenerated by utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
v_mov_b32_e32 v1, s1 2297; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2298; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2299; GFX10-WGP-NEXT: s_endpgm 2300; 2301; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: 2302; GFX10-CU: ; %bb.0: ; %entry 2303; GFX10-CU-NEXT: s_clause 0x1 2304; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2305; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2306; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2307; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2308; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2309; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2310; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2311; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2312; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2313; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2314; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2315; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2316; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2317; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2318; GFX10-CU-NEXT: s_endpgm 2319; 2320; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: 2321; SKIP-CACHE-INV: ; %bb.0: ; %entry 2322; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2323; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2324; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2325; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2326; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2327; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2328; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2329; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2330; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2331; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2332; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2333; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2334; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2335; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2336; SKIP-CACHE-INV-NEXT: s_endpgm 2337; 2338; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: 2339; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 
2340; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2341; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2342; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2343; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2344; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2345; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2346; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2347; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2348; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2349; 2350; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: 2351; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2352; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2353; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2354; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2355; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2356; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2357; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2358; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2359; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2360; GFX90A-TGSPLIT-NEXT: s_endpgm 2361; 2362; 2363 i32* %out, i32 %in, i32 %old) { 2364entry: 2365 %gep = getelementptr i32, i32* %out, i32 4 2366 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic 2367 %val0 = extractvalue { i32, i1 } %val, 0 2368 store i32 %val0, i32* %out, align 4 2369 ret void 2370} 2371 2372define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( 2373; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: 2374; GFX7: ; %bb.0: ; %entry 2375; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2376; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2377; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2378; GFX7-NEXT: s_add_u32 s4, s0, 16 2379; GFX7-NEXT: s_addc_u32 s5, s1, 0 2380; GFX7-NEXT: v_mov_b32_e32 v0, s4 2381; 
GFX7-NEXT: v_mov_b32_e32 v2, s2 2382; GFX7-NEXT: v_mov_b32_e32 v1, s5 2383; GFX7-NEXT: v_mov_b32_e32 v3, s3 2384; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2385; GFX7-NEXT: v_mov_b32_e32 v0, s0 2386; GFX7-NEXT: v_mov_b32_e32 v1, s1 2387; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2388; GFX7-NEXT: flat_store_dword v[0:1], v2 2389; GFX7-NEXT: s_endpgm 2390; 2391; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: 2392; GFX10-WGP: ; %bb.0: ; %entry 2393; GFX10-WGP-NEXT: s_clause 0x1 2394; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2395; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2396; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2397; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2398; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2399; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2400; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2401; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2402; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2403; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2404; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2405; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2406; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2407; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2408; GFX10-WGP-NEXT: s_endpgm 2409; 2410; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: 2411; GFX10-CU: ; %bb.0: ; %entry 2412; GFX10-CU-NEXT: s_clause 0x1 2413; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2414; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2415; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2416; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2417; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2418; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2419; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2420; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2421; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2422; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2423; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2424; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2425; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2426; GFX10-CU-NEXT: flat_store_dword v[0:1], 
v2 2427; GFX10-CU-NEXT: s_endpgm 2428; 2429; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: 2430; SKIP-CACHE-INV: ; %bb.0: ; %entry 2431; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2432; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2433; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2434; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2435; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2436; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2437; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2438; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2439; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2440; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2441; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2442; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2443; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2444; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2445; SKIP-CACHE-INV-NEXT: s_endpgm 2446; 2447; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: 2448; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2449; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2450; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2451; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2452; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2453; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2454; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2455; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2456; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2457; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2458; 2459; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: 2460; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2461; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2462; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2463; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2464; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 
2465; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2466; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2467; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2468; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2469; GFX90A-TGSPLIT-NEXT: s_endpgm 2470; 2471; 2472 i32* %out, i32 %in, i32 %old) { 2473entry: 2474 %gep = getelementptr i32, i32* %out, i32 4 2475 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic 2476 %val0 = extractvalue { i32, i1 } %val, 0 2477 store i32 %val0, i32* %out, align 4 2478 ret void 2479} 2480 2481define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( 2482; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: 2483; GFX7: ; %bb.0: ; %entry 2484; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2485; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2486; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2487; GFX7-NEXT: s_add_u32 s4, s0, 16 2488; GFX7-NEXT: s_addc_u32 s5, s1, 0 2489; GFX7-NEXT: v_mov_b32_e32 v0, s4 2490; GFX7-NEXT: v_mov_b32_e32 v2, s2 2491; GFX7-NEXT: v_mov_b32_e32 v1, s5 2492; GFX7-NEXT: v_mov_b32_e32 v3, s3 2493; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2494; GFX7-NEXT: v_mov_b32_e32 v0, s0 2495; GFX7-NEXT: v_mov_b32_e32 v1, s1 2496; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2497; GFX7-NEXT: flat_store_dword v[0:1], v2 2498; GFX7-NEXT: s_endpgm 2499; 2500; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: 2501; GFX10-WGP: ; %bb.0: ; %entry 2502; GFX10-WGP-NEXT: s_clause 0x1 2503; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2504; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2505; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2506; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2507; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2508; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2509; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2510; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2511; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2512; GFX10-WGP-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2513; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2514; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2515; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2516; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2517; GFX10-WGP-NEXT: s_endpgm 2518; 2519; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: 2520; GFX10-CU: ; %bb.0: ; %entry 2521; GFX10-CU-NEXT: s_clause 0x1 2522; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2523; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2524; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2525; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2526; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2527; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2528; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2529; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2530; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2531; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2532; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2533; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2534; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2535; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2536; GFX10-CU-NEXT: s_endpgm 2537; 2538; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: 2539; SKIP-CACHE-INV: ; %bb.0: ; %entry 2540; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2541; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2542; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2543; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2544; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2545; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2546; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2547; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2548; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2549; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2550; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2551; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2552; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2553; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2554; SKIP-CACHE-INV-NEXT: s_endpgm 2555; 2556; 
GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: 2557; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2558; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2559; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2560; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2561; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2562; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2563; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2564; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2565; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2566; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2567; 2568; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: 2569; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2570; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2571; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2572; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2573; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2574; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2575; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2576; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2577; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2578; GFX90A-TGSPLIT-NEXT: s_endpgm 2579; 2580; 2581 i32* %out, i32 %in, i32 %old) { 2582entry: 2583 %gep = getelementptr i32, i32* %out, i32 4 2584 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire 2585 %val0 = extractvalue { i32, i1 } %val, 0 2586 store i32 %val0, i32* %out, align 4 2587 ret void 2588} 2589 2590define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( 2591; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: 2592; GFX7: ; %bb.0: ; %entry 2593; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2594; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2595; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2596; GFX7-NEXT: 
s_add_u32 s4, s0, 16 2597; GFX7-NEXT: s_addc_u32 s5, s1, 0 2598; GFX7-NEXT: v_mov_b32_e32 v0, s4 2599; GFX7-NEXT: v_mov_b32_e32 v2, s2 2600; GFX7-NEXT: v_mov_b32_e32 v1, s5 2601; GFX7-NEXT: v_mov_b32_e32 v3, s3 2602; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2603; GFX7-NEXT: v_mov_b32_e32 v0, s0 2604; GFX7-NEXT: v_mov_b32_e32 v1, s1 2605; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2606; GFX7-NEXT: flat_store_dword v[0:1], v2 2607; GFX7-NEXT: s_endpgm 2608; 2609; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: 2610; GFX10-WGP: ; %bb.0: ; %entry 2611; GFX10-WGP-NEXT: s_clause 0x1 2612; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2613; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2614; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2615; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2616; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2617; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2618; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2619; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2620; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2621; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2622; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2623; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2624; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2625; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2626; GFX10-WGP-NEXT: s_endpgm 2627; 2628; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: 2629; GFX10-CU: ; %bb.0: ; %entry 2630; GFX10-CU-NEXT: s_clause 0x1 2631; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2632; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2633; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2634; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2635; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2636; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2637; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2638; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2639; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2640; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2641; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2642; GFX10-CU-NEXT: v_mov_b32_e32 v1, 
s1 2643; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2644; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2645; GFX10-CU-NEXT: s_endpgm 2646; 2647; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: 2648; SKIP-CACHE-INV: ; %bb.0: ; %entry 2649; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2650; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2651; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2652; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2653; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2654; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2655; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2656; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2657; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2658; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2659; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2660; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2661; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2662; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2663; SKIP-CACHE-INV-NEXT: s_endpgm 2664; 2665; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: 2666; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2667; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2668; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2669; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2670; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2671; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2672; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2673; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2674; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2675; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2676; 2677; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: 2678; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2679; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2680; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2681; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 2682; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2683; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2684; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2685; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2686; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2687; GFX90A-TGSPLIT-NEXT: s_endpgm 2688; 2689; 2690 i32* %out, i32 %in, i32 %old) { 2691entry: 2692 %gep = getelementptr i32, i32* %out, i32 4 2693 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire 2694 %val0 = extractvalue { i32, i1 } %val, 0 2695 store i32 %val0, i32* %out, align 4 2696 ret void 2697} 2698 2699define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( 2700; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: 2701; GFX7: ; %bb.0: ; %entry 2702; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2703; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2704; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2705; GFX7-NEXT: s_add_u32 s4, s0, 16 2706; GFX7-NEXT: s_addc_u32 s5, s1, 0 2707; GFX7-NEXT: v_mov_b32_e32 v0, s4 2708; GFX7-NEXT: v_mov_b32_e32 v2, s2 2709; GFX7-NEXT: v_mov_b32_e32 v1, s5 2710; GFX7-NEXT: v_mov_b32_e32 v3, s3 2711; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2712; GFX7-NEXT: v_mov_b32_e32 v0, s0 2713; GFX7-NEXT: v_mov_b32_e32 v1, s1 2714; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2715; GFX7-NEXT: flat_store_dword v[0:1], v2 2716; GFX7-NEXT: s_endpgm 2717; 2718; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: 2719; GFX10-WGP: ; %bb.0: ; %entry 2720; GFX10-WGP-NEXT: s_clause 0x1 2721; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2722; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2723; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2724; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2725; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2726; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2727; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2728; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s5 2729; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2730; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2731; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2732; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2733; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2734; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2735; GFX10-WGP-NEXT: s_endpgm 2736; 2737; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: 2738; GFX10-CU: ; %bb.0: ; %entry 2739; GFX10-CU-NEXT: s_clause 0x1 2740; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2741; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2742; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2743; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2744; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2745; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2746; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2747; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2748; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2749; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2750; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2751; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2752; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2753; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2754; GFX10-CU-NEXT: s_endpgm 2755; 2756; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: 2757; SKIP-CACHE-INV: ; %bb.0: ; %entry 2758; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2759; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2760; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2761; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2762; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2763; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2764; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2765; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2766; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2767; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2768; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2769; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2770; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2771; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[0:1], v2 2772; SKIP-CACHE-INV-NEXT: s_endpgm 2773; 2774; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: 2775; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2776; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2777; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2778; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2779; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2780; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2781; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2782; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2783; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2784; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2785; 2786; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: 2787; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2788; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2789; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2790; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2791; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2792; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2793; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2794; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2795; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2796; GFX90A-TGSPLIT-NEXT: s_endpgm 2797; 2798; 2799 i32* %out, i32 %in, i32 %old) { 2800entry: 2801 %gep = getelementptr i32, i32* %out, i32 4 2802 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire 2803 %val0 = extractvalue { i32, i1 } %val, 0 2804 store i32 %val0, i32* %out, align 4 2805 ret void 2806} 2807 2808define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( 2809; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: 2810; GFX7: ; %bb.0: ; %entry 2811; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2812; GFX7-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x2 2813; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2814; GFX7-NEXT: s_add_u32 s4, s0, 16 2815; GFX7-NEXT: s_addc_u32 s5, s1, 0 2816; GFX7-NEXT: v_mov_b32_e32 v0, s4 2817; GFX7-NEXT: v_mov_b32_e32 v2, s2 2818; GFX7-NEXT: v_mov_b32_e32 v1, s5 2819; GFX7-NEXT: v_mov_b32_e32 v3, s3 2820; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2821; GFX7-NEXT: v_mov_b32_e32 v0, s0 2822; GFX7-NEXT: v_mov_b32_e32 v1, s1 2823; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2824; GFX7-NEXT: flat_store_dword v[0:1], v2 2825; GFX7-NEXT: s_endpgm 2826; 2827; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: 2828; GFX10-WGP: ; %bb.0: ; %entry 2829; GFX10-WGP-NEXT: s_clause 0x1 2830; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2831; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2832; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2833; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2834; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2835; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2836; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2837; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2838; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2839; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2840; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2841; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2842; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2843; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2844; GFX10-WGP-NEXT: s_endpgm 2845; 2846; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: 2847; GFX10-CU: ; %bb.0: ; %entry 2848; GFX10-CU-NEXT: s_clause 0x1 2849; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2850; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2851; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2852; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2853; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2854; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2855; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2856; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2857; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2858; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2859; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2860; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2861; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2862; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2863; GFX10-CU-NEXT: s_endpgm 2864; 2865; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: 2866; SKIP-CACHE-INV: ; %bb.0: ; %entry 2867; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2868; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2869; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2870; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2871; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2872; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2873; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2874; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2875; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2876; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2877; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2878; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2879; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2880; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2881; SKIP-CACHE-INV-NEXT: s_endpgm 2882; 2883; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: 2884; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2885; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2886; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2887; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2888; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2889; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2890; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2891; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2892; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2893; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2894; 2895; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: 2896; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2897; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2898; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2899; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2900; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2901; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2902; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 2903; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2904; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2905; GFX90A-TGSPLIT-NEXT: s_endpgm 2906; 2907; 2908 i32* %out, i32 %in, i32 %old) { 2909entry: 2910 %gep = getelementptr i32, i32* %out, i32 4 2911 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire 2912 %val0 = extractvalue { i32, i1 } %val, 0 2913 store i32 %val0, i32* %out, align 4 2914 ret void 2915} 2916 2917define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( 2918; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: 2919; GFX7: ; %bb.0: ; %entry 2920; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2921; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2922; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2923; GFX7-NEXT: s_add_u32 s4, s0, 16 2924; GFX7-NEXT: s_addc_u32 s5, s1, 0 2925; GFX7-NEXT: v_mov_b32_e32 v0, s4 2926; GFX7-NEXT: v_mov_b32_e32 v2, s2 2927; GFX7-NEXT: v_mov_b32_e32 v1, s5 2928; GFX7-NEXT: v_mov_b32_e32 v3, s3 2929; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2930; GFX7-NEXT: v_mov_b32_e32 v0, s0 2931; GFX7-NEXT: v_mov_b32_e32 v1, s1 2932; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2933; GFX7-NEXT: flat_store_dword v[0:1], v2 2934; GFX7-NEXT: s_endpgm 2935; 2936; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: 2937; GFX10-WGP: ; %bb.0: ; %entry 2938; GFX10-WGP-NEXT: s_clause 0x1 2939; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2940; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2941; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2942; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2943; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2944; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s4 2945; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2946; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2947; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2948; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2949; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2950; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2951; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2952; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2953; GFX10-WGP-NEXT: s_endpgm 2954; 2955; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: 2956; GFX10-CU: ; %bb.0: ; %entry 2957; GFX10-CU-NEXT: s_clause 0x1 2958; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2959; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2960; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2961; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2962; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2963; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2964; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2965; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2966; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2967; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2968; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2969; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2970; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2971; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2972; GFX10-CU-NEXT: s_endpgm 2973; 2974; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: 2975; SKIP-CACHE-INV: ; %bb.0: ; %entry 2976; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2977; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2978; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2979; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2980; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2981; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2982; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2983; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2984; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2985; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2986; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2987; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 
s3 2988; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2989; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2990; SKIP-CACHE-INV-NEXT: s_endpgm 2991; 2992; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: 2993; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2994; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2995; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2996; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2997; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 2998; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 2999; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3000; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3001; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3002; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3003; 3004; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: 3005; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3006; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3007; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3008; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3009; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3010; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 3011; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 3012; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3013; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3014; GFX90A-TGSPLIT-NEXT: s_endpgm 3015; 3016; 3017 i32* %out, i32 %in, i32 %old) { 3018entry: 3019 %gep = getelementptr i32, i32* %out, i32 4 3020 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst 3021 %val0 = extractvalue { i32, i1 } %val, 0 3022 store i32 %val0, i32* %out, align 4 3023 ret void 3024} 3025 3026define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( 3027; GFX7-LABEL: flat_wavefront_one_as_unordered_load: 3028; GFX7: ; %bb.0: ; %entry 3029; 
GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3030; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3031; GFX7-NEXT: v_mov_b32_e32 v0, s0 3032; GFX7-NEXT: v_mov_b32_e32 v1, s1 3033; GFX7-NEXT: flat_load_dword v0, v[0:1] 3034; GFX7-NEXT: v_mov_b32_e32 v2, s2 3035; GFX7-NEXT: v_mov_b32_e32 v3, s3 3036; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3037; GFX7-NEXT: flat_store_dword v[2:3], v0 3038; GFX7-NEXT: s_endpgm 3039; 3040; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: 3041; GFX10-WGP: ; %bb.0: ; %entry 3042; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3043; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3044; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3045; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3046; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 3047; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 3048; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 3049; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3050; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3051; GFX10-WGP-NEXT: s_endpgm 3052; 3053; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: 3054; GFX10-CU: ; %bb.0: ; %entry 3055; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3056; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3057; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3058; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3059; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 3060; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 3061; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 3062; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3063; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3064; GFX10-CU-NEXT: s_endpgm 3065; 3066; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load: 3067; SKIP-CACHE-INV: ; %bb.0: ; %entry 3068; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 3069; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3070; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3071; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3072; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 3073; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3074; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 3075; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 3076; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 3077; SKIP-CACHE-INV-NEXT: s_endpgm 3078; 3079; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: 3080; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3081; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3082; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3083; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3084; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3085; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 3086; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3087; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 3088; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3089; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 3090; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3091; 3092; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: 3093; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3094; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3095; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3096; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3097; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3098; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 3099; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3100; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 3101; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3102; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 3103; GFX90A-TGSPLIT-NEXT: s_endpgm 3104; 3105; 3106 i32* %in, i32* %out) { 3107entry: 3108 %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 3109 store i32 %val, i32* %out 3110 ret void 3111} 3112 3113define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( 3114; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: 3115; GFX7: ; %bb.0: ; %entry 3116; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3117; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3118; GFX7-NEXT: v_mov_b32_e32 v0, s0 3119; GFX7-NEXT: v_mov_b32_e32 v1, s1 3120; GFX7-NEXT: flat_load_dword v0, v[0:1] 3121; GFX7-NEXT: v_mov_b32_e32 v2, s2 3122; GFX7-NEXT: 
v_mov_b32_e32 v3, s3 3123; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3124; GFX7-NEXT: flat_store_dword v[2:3], v0 3125; GFX7-NEXT: s_endpgm 3126; 3127; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: 3128; GFX10-WGP: ; %bb.0: ; %entry 3129; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3130; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3131; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3132; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3133; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 3134; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 3135; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 3136; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3137; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3138; GFX10-WGP-NEXT: s_endpgm 3139; 3140; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: 3141; GFX10-CU: ; %bb.0: ; %entry 3142; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3143; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3144; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3145; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3146; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 3147; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 3148; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 3149; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3150; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3151; GFX10-CU-NEXT: s_endpgm 3152; 3153; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load: 3154; SKIP-CACHE-INV: ; %bb.0: ; %entry 3155; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 3156; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3157; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3158; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3159; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 3160; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3161; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 3162; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3163; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 3164; SKIP-CACHE-INV-NEXT: s_endpgm 3165; 3166; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: 3167; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3168; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3169; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3170; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3171; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3172; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 3173; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3174; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 3175; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3176; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 3177; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3178; 3179; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: 3180; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3181; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3182; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3183; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3184; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3185; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 3186; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3187; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 3188; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3189; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 3190; GFX90A-TGSPLIT-NEXT: s_endpgm 3191; 3192; 3193 i32* %in, i32* %out) { 3194entry: 3195 %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 3196 store i32 %val, i32* %out 3197 ret void 3198} 3199 3200define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( 3201; GFX7-LABEL: flat_wavefront_one_as_acquire_load: 3202; GFX7: ; %bb.0: ; %entry 3203; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3204; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3205; GFX7-NEXT: v_mov_b32_e32 v0, s0 3206; GFX7-NEXT: v_mov_b32_e32 v1, s1 3207; GFX7-NEXT: flat_load_dword v0, v[0:1] 3208; GFX7-NEXT: v_mov_b32_e32 v2, s2 3209; GFX7-NEXT: v_mov_b32_e32 v3, s3 3210; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3211; GFX7-NEXT: flat_store_dword v[2:3], v0 3212; GFX7-NEXT: s_endpgm 3213; 3214; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: 3215; GFX10-WGP: ; %bb.0: ; %entry 3216; 
GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3217; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3218; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3219; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3220; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 3221; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 3222; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 3223; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3224; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3225; GFX10-WGP-NEXT: s_endpgm 3226; 3227; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: 3228; GFX10-CU: ; %bb.0: ; %entry 3229; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3230; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3231; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3232; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3233; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 3234; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 3235; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 3236; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3237; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3238; GFX10-CU-NEXT: s_endpgm 3239; 3240; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load: 3241; SKIP-CACHE-INV: ; %bb.0: ; %entry 3242; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 3243; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3244; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3245; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3246; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 3247; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3248; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 3249; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3250; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 3251; SKIP-CACHE-INV-NEXT: s_endpgm 3252; 3253; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: 3254; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3255; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3256; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3257; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3258; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3259; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 
3260; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3261; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 3262; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3263; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 3264; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3265; 3266; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: 3267; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3268; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3269; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3270; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3271; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3272; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 3273; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3274; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 3275; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3276; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 3277; GFX90A-TGSPLIT-NEXT: s_endpgm 3278; 3279; 3280 i32* %in, i32* %out) { 3281entry: 3282 %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 3283 store i32 %val, i32* %out 3284 ret void 3285} 3286 3287define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( 3288; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: 3289; GFX7: ; %bb.0: ; %entry 3290; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3291; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3292; GFX7-NEXT: v_mov_b32_e32 v0, s0 3293; GFX7-NEXT: v_mov_b32_e32 v1, s1 3294; GFX7-NEXT: flat_load_dword v0, v[0:1] 3295; GFX7-NEXT: v_mov_b32_e32 v2, s2 3296; GFX7-NEXT: v_mov_b32_e32 v3, s3 3297; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3298; GFX7-NEXT: flat_store_dword v[2:3], v0 3299; GFX7-NEXT: s_endpgm 3300; 3301; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: 3302; GFX10-WGP: ; %bb.0: ; %entry 3303; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3304; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3305; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3306; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3307; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 3308; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 3309; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 3310; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3311; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3312; GFX10-WGP-NEXT: s_endpgm 3313; 3314; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: 3315; GFX10-CU: ; %bb.0: ; %entry 3316; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3317; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3318; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3319; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3320; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 3321; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 3322; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 3323; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3324; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3325; GFX10-CU-NEXT: s_endpgm 3326; 3327; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load: 3328; SKIP-CACHE-INV: ; %bb.0: ; %entry 3329; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 3330; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3331; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3332; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3333; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 3334; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3335; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 3336; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3337; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 3338; SKIP-CACHE-INV-NEXT: s_endpgm 3339; 3340; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: 3341; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3342; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3343; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3344; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3345; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3346; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 3347; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3348; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 3349; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3350; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 3351; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3352; 3353; 
GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: 3354; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3355; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3356; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3357; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3358; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3359; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 3360; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3361; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 3362; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3363; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 3364; GFX90A-TGSPLIT-NEXT: s_endpgm 3365; 3366; 3367 i32* %in, i32* %out) { 3368entry: 3369 %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 3370 store i32 %val, i32* %out 3371 ret void 3372} 3373 3374define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( 3375; GFX7-LABEL: flat_wavefront_one_as_unordered_store: 3376; GFX7: ; %bb.0: ; %entry 3377; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 3378; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 3379; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3380; GFX7-NEXT: v_mov_b32_e32 v2, s2 3381; GFX7-NEXT: v_mov_b32_e32 v0, s0 3382; GFX7-NEXT: v_mov_b32_e32 v1, s1 3383; GFX7-NEXT: flat_store_dword v[0:1], v2 3384; GFX7-NEXT: s_endpgm 3385; 3386; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: 3387; GFX10-WGP: ; %bb.0: ; %entry 3388; GFX10-WGP-NEXT: s_clause 0x1 3389; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3390; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 3391; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3392; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3393; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3394; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3395; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3396; GFX10-WGP-NEXT: s_endpgm 3397; 3398; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: 3399; GFX10-CU: ; %bb.0: ; %entry 3400; GFX10-CU-NEXT: s_clause 0x1 3401; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3402; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 
0x0 3403; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3404; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3405; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3406; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3407; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3408; GFX10-CU-NEXT: s_endpgm 3409; 3410; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store: 3411; SKIP-CACHE-INV: ; %bb.0: ; %entry 3412; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 3413; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3414; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3415; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3416; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3417; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3418; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3419; SKIP-CACHE-INV-NEXT: s_endpgm 3420; 3421; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: 3422; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3423; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 3424; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3425; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3426; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3427; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3428; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3429; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3430; 3431; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: 3432; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3433; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 3434; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3435; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3436; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3437; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3438; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3439; GFX90A-TGSPLIT-NEXT: s_endpgm 3440; 3441; 3442 i32 %in, i32* %out) { 3443entry: 3444 store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 3445 ret void 3446} 3447 3448define amdgpu_kernel void 
@flat_wavefront_one_as_monotonic_store( 3449; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: 3450; GFX7: ; %bb.0: ; %entry 3451; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 3452; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 3453; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3454; GFX7-NEXT: v_mov_b32_e32 v2, s2 3455; GFX7-NEXT: v_mov_b32_e32 v0, s0 3456; GFX7-NEXT: v_mov_b32_e32 v1, s1 3457; GFX7-NEXT: flat_store_dword v[0:1], v2 3458; GFX7-NEXT: s_endpgm 3459; 3460; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: 3461; GFX10-WGP: ; %bb.0: ; %entry 3462; GFX10-WGP-NEXT: s_clause 0x1 3463; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3464; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 3465; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3466; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3467; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3468; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3469; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3470; GFX10-WGP-NEXT: s_endpgm 3471; 3472; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: 3473; GFX10-CU: ; %bb.0: ; %entry 3474; GFX10-CU-NEXT: s_clause 0x1 3475; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3476; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 3477; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3478; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3479; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3480; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3481; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3482; GFX10-CU-NEXT: s_endpgm 3483; 3484; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store: 3485; SKIP-CACHE-INV: ; %bb.0: ; %entry 3486; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 3487; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3488; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3489; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3490; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3491; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3492; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3493; SKIP-CACHE-INV-NEXT: s_endpgm 3494; 3495; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_one_as_monotonic_store: 3496; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3497; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 3498; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3499; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3500; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3501; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3502; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3503; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3504; 3505; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: 3506; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3507; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 3508; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3509; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3510; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3511; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3512; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3513; GFX90A-TGSPLIT-NEXT: s_endpgm 3514; 3515; 3516 i32 %in, i32* %out) { 3517entry: 3518 store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 3519 ret void 3520} 3521 3522define amdgpu_kernel void @flat_wavefront_one_as_release_store( 3523; GFX7-LABEL: flat_wavefront_one_as_release_store: 3524; GFX7: ; %bb.0: ; %entry 3525; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 3526; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 3527; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3528; GFX7-NEXT: v_mov_b32_e32 v2, s2 3529; GFX7-NEXT: v_mov_b32_e32 v0, s0 3530; GFX7-NEXT: v_mov_b32_e32 v1, s1 3531; GFX7-NEXT: flat_store_dword v[0:1], v2 3532; GFX7-NEXT: s_endpgm 3533; 3534; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: 3535; GFX10-WGP: ; %bb.0: ; %entry 3536; GFX10-WGP-NEXT: s_clause 0x1 3537; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3538; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 3539; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3540; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3541; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3542; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3543; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3544; GFX10-WGP-NEXT: s_endpgm 3545; 3546; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: 3547; GFX10-CU: ; %bb.0: ; %entry 3548; GFX10-CU-NEXT: s_clause 0x1 3549; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3550; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 3551; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3552; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3553; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3554; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3555; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3556; GFX10-CU-NEXT: s_endpgm 3557; 3558; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store: 3559; SKIP-CACHE-INV: ; %bb.0: ; %entry 3560; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 3561; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3562; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3563; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3564; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3565; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3566; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3567; SKIP-CACHE-INV-NEXT: s_endpgm 3568; 3569; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: 3570; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3571; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 3572; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3573; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3574; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3575; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3576; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3577; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3578; 3579; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: 3580; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3581; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 3582; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3583; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3584; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3585; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3586; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3587; GFX90A-TGSPLIT-NEXT: s_endpgm 3588; 3589; 3590 i32 %in, i32* %out) { 3591entry: 3592 store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 3593 ret void 3594} 3595 3596define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( 3597; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: 3598; GFX7: ; %bb.0: ; %entry 3599; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 3600; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 3601; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3602; GFX7-NEXT: v_mov_b32_e32 v2, s2 3603; GFX7-NEXT: v_mov_b32_e32 v0, s0 3604; GFX7-NEXT: v_mov_b32_e32 v1, s1 3605; GFX7-NEXT: flat_store_dword v[0:1], v2 3606; GFX7-NEXT: s_endpgm 3607; 3608; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: 3609; GFX10-WGP: ; %bb.0: ; %entry 3610; GFX10-WGP-NEXT: s_clause 0x1 3611; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3612; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 3613; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3614; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3615; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3616; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3617; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3618; GFX10-WGP-NEXT: s_endpgm 3619; 3620; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: 3621; GFX10-CU: ; %bb.0: ; %entry 3622; GFX10-CU-NEXT: s_clause 0x1 3623; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3624; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 3625; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3626; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3627; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3628; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3629; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3630; GFX10-CU-NEXT: s_endpgm 3631; 3632; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store: 3633; SKIP-CACHE-INV: ; %bb.0: ; %entry 3634; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 3635; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3636; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) 3637; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3638; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3639; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3640; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3641; SKIP-CACHE-INV-NEXT: s_endpgm 3642; 3643; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: 3644; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3645; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 3646; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3647; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3648; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3649; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3650; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 3651; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3652; 3653; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: 3654; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3655; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 3656; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3657; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3658; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3659; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3660; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 3661; GFX90A-TGSPLIT-NEXT: s_endpgm 3662; 3663; 3664 i32 %in, i32* %out) { 3665entry: 3666 store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 3667 ret void 3668} 3669 3670define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( 3671; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: 3672; GFX7: ; %bb.0: ; %entry 3673; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3674; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3675; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3676; GFX7-NEXT: v_mov_b32_e32 v0, s0 3677; GFX7-NEXT: v_mov_b32_e32 v1, s1 3678; GFX7-NEXT: v_mov_b32_e32 v2, s2 3679; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3680; GFX7-NEXT: s_endpgm 3681; 3682; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: 3683; 
GFX10-WGP: ; %bb.0: ; %entry 3684; GFX10-WGP-NEXT: s_clause 0x1 3685; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3686; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3687; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3688; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3689; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3690; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3691; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3692; GFX10-WGP-NEXT: s_endpgm 3693; 3694; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: 3695; GFX10-CU: ; %bb.0: ; %entry 3696; GFX10-CU-NEXT: s_clause 0x1 3697; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3698; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3699; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3700; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3701; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3702; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3703; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3704; GFX10-CU-NEXT: s_endpgm 3705; 3706; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: 3707; SKIP-CACHE-INV: ; %bb.0: ; %entry 3708; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3709; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3710; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3711; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3712; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3713; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3714; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3715; SKIP-CACHE-INV-NEXT: s_endpgm 3716; 3717; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: 3718; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3719; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3720; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 3721; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3722; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3723; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3724; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 3725; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3726; 3727; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_monotonic_atomicrmw: 3728; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3729; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3730; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 3731; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3732; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3733; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3734; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 3735; GFX90A-TGSPLIT-NEXT: s_endpgm 3736; 3737; 3738 i32* %out, i32 %in) { 3739entry: 3740 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic 3741 ret void 3742} 3743 3744define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( 3745; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: 3746; GFX7: ; %bb.0: ; %entry 3747; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3748; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3749; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3750; GFX7-NEXT: v_mov_b32_e32 v0, s0 3751; GFX7-NEXT: v_mov_b32_e32 v1, s1 3752; GFX7-NEXT: v_mov_b32_e32 v2, s2 3753; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3754; GFX7-NEXT: s_endpgm 3755; 3756; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: 3757; GFX10-WGP: ; %bb.0: ; %entry 3758; GFX10-WGP-NEXT: s_clause 0x1 3759; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3760; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3761; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3762; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3763; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3764; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3765; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3766; GFX10-WGP-NEXT: s_endpgm 3767; 3768; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: 3769; GFX10-CU: ; %bb.0: ; %entry 3770; GFX10-CU-NEXT: s_clause 0x1 3771; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3772; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3773; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3774; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3775; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3776; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3777; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3778; GFX10-CU-NEXT: s_endpgm 3779; 3780; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw: 3781; SKIP-CACHE-INV: ; %bb.0: ; %entry 3782; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3783; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3784; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3785; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3786; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3787; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3788; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3789; SKIP-CACHE-INV-NEXT: s_endpgm 3790; 3791; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: 3792; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3793; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3794; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 3795; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3796; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3797; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3798; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 3799; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3800; 3801; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: 3802; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3803; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3804; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 3805; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3806; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3807; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3808; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 3809; GFX90A-TGSPLIT-NEXT: s_endpgm 3810; 3811; 3812 i32* %out, i32 %in) { 3813entry: 3814 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire 3815 ret void 3816} 3817 3818define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( 3819; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: 3820; GFX7: ; %bb.0: ; %entry 3821; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 3822; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3823; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3824; GFX7-NEXT: v_mov_b32_e32 v0, s0 3825; GFX7-NEXT: v_mov_b32_e32 v1, s1 3826; GFX7-NEXT: v_mov_b32_e32 v2, s2 3827; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3828; GFX7-NEXT: s_endpgm 3829; 3830; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: 3831; GFX10-WGP: ; %bb.0: ; %entry 3832; GFX10-WGP-NEXT: s_clause 0x1 3833; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3834; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3835; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3836; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3837; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3838; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3839; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3840; GFX10-WGP-NEXT: s_endpgm 3841; 3842; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: 3843; GFX10-CU: ; %bb.0: ; %entry 3844; GFX10-CU-NEXT: s_clause 0x1 3845; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3846; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3847; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3848; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3849; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3850; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3851; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3852; GFX10-CU-NEXT: s_endpgm 3853; 3854; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw: 3855; SKIP-CACHE-INV: ; %bb.0: ; %entry 3856; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3857; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3858; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3859; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3860; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3861; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3862; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3863; SKIP-CACHE-INV-NEXT: s_endpgm 3864; 3865; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: 3866; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3867; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
3868; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 3869; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3870; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3871; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3872; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 3873; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3874; 3875; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: 3876; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3877; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3878; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 3879; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3880; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3881; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3882; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 3883; GFX90A-TGSPLIT-NEXT: s_endpgm 3884; 3885; 3886 i32* %out, i32 %in) { 3887entry: 3888 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release 3889 ret void 3890} 3891 3892define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( 3893; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: 3894; GFX7: ; %bb.0: ; %entry 3895; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3896; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3897; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3898; GFX7-NEXT: v_mov_b32_e32 v0, s0 3899; GFX7-NEXT: v_mov_b32_e32 v1, s1 3900; GFX7-NEXT: v_mov_b32_e32 v2, s2 3901; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3902; GFX7-NEXT: s_endpgm 3903; 3904; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: 3905; GFX10-WGP: ; %bb.0: ; %entry 3906; GFX10-WGP-NEXT: s_clause 0x1 3907; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3908; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3909; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3910; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3911; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3912; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3913; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3914; GFX10-WGP-NEXT: s_endpgm 3915; 3916; 
GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: 3917; GFX10-CU: ; %bb.0: ; %entry 3918; GFX10-CU-NEXT: s_clause 0x1 3919; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3920; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3921; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3922; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3923; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3924; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3925; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3926; GFX10-CU-NEXT: s_endpgm 3927; 3928; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: 3929; SKIP-CACHE-INV: ; %bb.0: ; %entry 3930; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3931; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3932; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3933; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3934; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3935; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3936; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3937; SKIP-CACHE-INV-NEXT: s_endpgm 3938; 3939; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: 3940; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3941; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3942; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 3943; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3944; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3945; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3946; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 3947; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3948; 3949; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: 3950; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3951; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3952; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 3953; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3954; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 3955; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 3956; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 3957; 
GFX90A-TGSPLIT-NEXT: s_endpgm 3958; 3959; 3960 i32* %out, i32 %in) { 3961entry: 3962 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel 3963 ret void 3964} 3965 3966define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( 3967; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: 3968; GFX7: ; %bb.0: ; %entry 3969; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3970; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3971; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3972; GFX7-NEXT: v_mov_b32_e32 v0, s0 3973; GFX7-NEXT: v_mov_b32_e32 v1, s1 3974; GFX7-NEXT: v_mov_b32_e32 v2, s2 3975; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3976; GFX7-NEXT: s_endpgm 3977; 3978; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: 3979; GFX10-WGP: ; %bb.0: ; %entry 3980; GFX10-WGP-NEXT: s_clause 0x1 3981; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3982; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3983; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3984; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3985; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3986; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3987; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3988; GFX10-WGP-NEXT: s_endpgm 3989; 3990; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: 3991; GFX10-CU: ; %bb.0: ; %entry 3992; GFX10-CU-NEXT: s_clause 0x1 3993; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3994; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3995; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3996; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3997; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3998; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3999; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 4000; GFX10-CU-NEXT: s_endpgm 4001; 4002; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: 4003; SKIP-CACHE-INV: ; %bb.0: ; %entry 4004; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4005; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4006; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4007; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 
4008; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4009; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4010; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 4011; SKIP-CACHE-INV-NEXT: s_endpgm 4012; 4013; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: 4014; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4015; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4016; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4017; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4018; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4019; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4020; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 4021; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4022; 4023; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: 4024; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4025; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4026; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4027; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4028; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4029; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4030; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 4031; GFX90A-TGSPLIT-NEXT: s_endpgm 4032; 4033; 4034 i32* %out, i32 %in) { 4035entry: 4036 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst 4037 ret void 4038} 4039 4040define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( 4041; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: 4042; GFX7: ; %bb.0: ; %entry 4043; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4044; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 4045; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4046; GFX7-NEXT: v_mov_b32_e32 v0, s0 4047; GFX7-NEXT: v_mov_b32_e32 v1, s1 4048; GFX7-NEXT: v_mov_b32_e32 v2, s2 4049; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4050; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4051; GFX7-NEXT: flat_store_dword v[0:1], v2 4052; GFX7-NEXT: s_endpgm 4053; 4054; GFX10-WGP-LABEL: 
flat_wavefront_one_as_acquire_ret_atomicrmw: 4055; GFX10-WGP: ; %bb.0: ; %entry 4056; GFX10-WGP-NEXT: s_clause 0x1 4057; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4058; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 4059; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4060; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4061; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4062; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4063; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4064; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4065; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4066; GFX10-WGP-NEXT: s_endpgm 4067; 4068; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: 4069; GFX10-CU: ; %bb.0: ; %entry 4070; GFX10-CU-NEXT: s_clause 0x1 4071; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4072; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 4073; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4074; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4075; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4076; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4077; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4078; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4079; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4080; GFX10-CU-NEXT: s_endpgm 4081; 4082; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: 4083; SKIP-CACHE-INV: ; %bb.0: ; %entry 4084; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4085; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4086; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4087; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4088; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4089; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4090; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4091; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4092; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4093; SKIP-CACHE-INV-NEXT: s_endpgm 4094; 4095; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: 4096; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4097; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 4098; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4099; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4100; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4101; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4102; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4103; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4104; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4105; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4106; 4107; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: 4108; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4109; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4110; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4111; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4112; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4113; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4114; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4115; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4116; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4117; GFX90A-TGSPLIT-NEXT: s_endpgm 4118; 4119; 4120 i32* %out, i32 %in) { 4121entry: 4122 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire 4123 store i32 %val, i32* %out, align 4 4124 ret void 4125} 4126 4127define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( 4128; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: 4129; GFX7: ; %bb.0: ; %entry 4130; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4131; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 4132; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4133; GFX7-NEXT: v_mov_b32_e32 v0, s0 4134; GFX7-NEXT: v_mov_b32_e32 v1, s1 4135; GFX7-NEXT: v_mov_b32_e32 v2, s2 4136; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4137; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4138; GFX7-NEXT: flat_store_dword v[0:1], v2 4139; GFX7-NEXT: s_endpgm 4140; 4141; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: 4142; GFX10-WGP: ; %bb.0: ; %entry 
4143; GFX10-WGP-NEXT: s_clause 0x1 4144; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4145; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 4146; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4147; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4148; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4149; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4150; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4151; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4152; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4153; GFX10-WGP-NEXT: s_endpgm 4154; 4155; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: 4156; GFX10-CU: ; %bb.0: ; %entry 4157; GFX10-CU-NEXT: s_clause 0x1 4158; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4159; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 4160; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4161; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4162; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4163; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4164; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4165; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4166; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4167; GFX10-CU-NEXT: s_endpgm 4168; 4169; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: 4170; SKIP-CACHE-INV: ; %bb.0: ; %entry 4171; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4172; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4173; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4174; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4175; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4176; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4177; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4178; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4179; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4180; SKIP-CACHE-INV-NEXT: s_endpgm 4181; 4182; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: 4183; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4184; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4185; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 
4186; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4187; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4188; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4189; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4190; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4191; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4192; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4193; 4194; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: 4195; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4196; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4197; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4198; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4199; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4200; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4201; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4202; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4203; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4204; GFX90A-TGSPLIT-NEXT: s_endpgm 4205; 4206; 4207 i32* %out, i32 %in) { 4208entry: 4209 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel 4210 store i32 %val, i32* %out, align 4 4211 ret void 4212} 4213 4214define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( 4215; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: 4216; GFX7: ; %bb.0: ; %entry 4217; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4218; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 4219; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4220; GFX7-NEXT: v_mov_b32_e32 v0, s0 4221; GFX7-NEXT: v_mov_b32_e32 v1, s1 4222; GFX7-NEXT: v_mov_b32_e32 v2, s2 4223; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4224; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4225; GFX7-NEXT: flat_store_dword v[0:1], v2 4226; GFX7-NEXT: s_endpgm 4227; 4228; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: 4229; GFX10-WGP: ; %bb.0: ; %entry 4230; GFX10-WGP-NEXT: s_clause 0x1 4231; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 4232; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 4233; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4234; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4235; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4236; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4237; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4238; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4239; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4240; GFX10-WGP-NEXT: s_endpgm 4241; 4242; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: 4243; GFX10-CU: ; %bb.0: ; %entry 4244; GFX10-CU-NEXT: s_clause 0x1 4245; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4246; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 4247; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4248; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4249; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4250; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4251; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4252; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4253; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4254; GFX10-CU-NEXT: s_endpgm 4255; 4256; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: 4257; SKIP-CACHE-INV: ; %bb.0: ; %entry 4258; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4259; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4260; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4261; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4262; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4263; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4264; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4265; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4266; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4267; SKIP-CACHE-INV-NEXT: s_endpgm 4268; 4269; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: 4270; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4271; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4272; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4273; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4274; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4275; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4276; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4277; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4278; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 4279; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4280; 4281; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: 4282; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4283; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4284; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 4285; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4286; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4287; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 4288; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 4289; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4290; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 4291; GFX90A-TGSPLIT-NEXT: s_endpgm 4292; 4293; 4294 i32* %out, i32 %in) { 4295entry: 4296 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst 4297 store i32 %val, i32* %out, align 4 4298 ret void 4299} 4300 4301define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( 4302; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: 4303; GFX7: ; %bb.0: ; %entry 4304; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4305; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4306; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4307; GFX7-NEXT: s_add_u32 s0, s0, 16 4308; GFX7-NEXT: s_addc_u32 s1, s1, 0 4309; GFX7-NEXT: v_mov_b32_e32 v0, s0 4310; GFX7-NEXT: v_mov_b32_e32 v2, s2 4311; GFX7-NEXT: v_mov_b32_e32 v1, s1 4312; GFX7-NEXT: v_mov_b32_e32 v3, s3 4313; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4314; GFX7-NEXT: s_endpgm 4315; 4316; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: 4317; GFX10-WGP: ; %bb.0: ; %entry 4318; GFX10-WGP-NEXT: s_clause 0x1 4319; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4320; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 4321; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4322; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4323; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4324; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4325; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4326; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4327; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4328; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4329; GFX10-WGP-NEXT: s_endpgm 4330; 4331; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: 4332; GFX10-CU: ; %bb.0: ; %entry 4333; GFX10-CU-NEXT: s_clause 0x1 4334; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4335; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4336; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4337; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4338; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4339; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4340; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4341; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4342; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4343; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4344; GFX10-CU-NEXT: s_endpgm 4345; 4346; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: 4347; SKIP-CACHE-INV: ; %bb.0: ; %entry 4348; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4349; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4350; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4351; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4352; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4353; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4354; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4355; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4356; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4357; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4358; SKIP-CACHE-INV-NEXT: s_endpgm 4359; 4360; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: 4361; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4362; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4363; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 4364; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4365; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4366; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4367; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4368; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4369; 4370; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: 4371; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4372; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4373; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4374; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4375; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4376; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4377; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4378; GFX90A-TGSPLIT-NEXT: s_endpgm 4379; 4380; 4381 i32* %out, i32 %in, i32 %old) { 4382entry: 4383 %gep = getelementptr i32, i32* %out, i32 4 4384 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic 4385 ret void 4386} 4387 4388define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( 4389; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: 4390; GFX7: ; %bb.0: ; %entry 4391; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4392; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4393; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4394; GFX7-NEXT: s_add_u32 s0, s0, 16 4395; GFX7-NEXT: s_addc_u32 s1, s1, 0 4396; GFX7-NEXT: v_mov_b32_e32 v0, s0 4397; GFX7-NEXT: v_mov_b32_e32 v2, s2 4398; GFX7-NEXT: v_mov_b32_e32 v1, s1 4399; GFX7-NEXT: v_mov_b32_e32 v3, s3 4400; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4401; GFX7-NEXT: s_endpgm 4402; 4403; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: 4404; GFX10-WGP: ; %bb.0: ; %entry 4405; GFX10-WGP-NEXT: s_clause 0x1 4406; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4407; GFX10-WGP-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 4408; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4409; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4410; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4411; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4412; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4413; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4414; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4415; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4416; GFX10-WGP-NEXT: s_endpgm 4417; 4418; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: 4419; GFX10-CU: ; %bb.0: ; %entry 4420; GFX10-CU-NEXT: s_clause 0x1 4421; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4422; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4423; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4424; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4425; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4426; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4427; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4428; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4429; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4430; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4431; GFX10-CU-NEXT: s_endpgm 4432; 4433; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: 4434; SKIP-CACHE-INV: ; %bb.0: ; %entry 4435; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4436; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4437; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4438; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4439; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4440; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4441; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4442; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4443; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4444; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4445; SKIP-CACHE-INV-NEXT: s_endpgm 4446; 4447; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: 4448; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4449; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4450; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4451; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4452; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4453; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4454; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4455; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4456; 4457; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: 4458; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4459; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4460; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4461; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4462; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4463; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4464; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4465; GFX90A-TGSPLIT-NEXT: s_endpgm 4466; 4467; 4468 i32* %out, i32 %in, i32 %old) { 4469entry: 4470 %gep = getelementptr i32, i32* %out, i32 4 4471 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic 4472 ret void 4473} 4474 4475define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( 4476; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: 4477; GFX7: ; %bb.0: ; %entry 4478; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4479; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4480; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4481; GFX7-NEXT: s_add_u32 s0, s0, 16 4482; GFX7-NEXT: s_addc_u32 s1, s1, 0 4483; GFX7-NEXT: v_mov_b32_e32 v0, s0 4484; GFX7-NEXT: v_mov_b32_e32 v2, s2 4485; GFX7-NEXT: v_mov_b32_e32 v1, s1 4486; GFX7-NEXT: v_mov_b32_e32 v3, s3 4487; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4488; GFX7-NEXT: s_endpgm 4489; 4490; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: 4491; GFX10-WGP: ; %bb.0: ; %entry 4492; GFX10-WGP-NEXT: s_clause 0x1 4493; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4494; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 
4495; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4496; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4497; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4498; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4499; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4500; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4501; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4502; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4503; GFX10-WGP-NEXT: s_endpgm 4504; 4505; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: 4506; GFX10-CU: ; %bb.0: ; %entry 4507; GFX10-CU-NEXT: s_clause 0x1 4508; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4509; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4510; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4511; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4512; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4513; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4514; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4515; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4516; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4517; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4518; GFX10-CU-NEXT: s_endpgm 4519; 4520; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: 4521; SKIP-CACHE-INV: ; %bb.0: ; %entry 4522; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4523; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4524; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4525; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4526; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4527; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4528; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4529; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4530; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4531; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4532; SKIP-CACHE-INV-NEXT: s_endpgm 4533; 4534; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: 4535; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4536; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4537; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4538; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 4539; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4540; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4541; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4542; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4543; 4544; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: 4545; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4546; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4547; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4548; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4549; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4550; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4551; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4552; GFX90A-TGSPLIT-NEXT: s_endpgm 4553; 4554; 4555 i32* %out, i32 %in, i32 %old) { 4556entry: 4557 %gep = getelementptr i32, i32* %out, i32 4 4558 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic 4559 ret void 4560} 4561 4562define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( 4563; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: 4564; GFX7: ; %bb.0: ; %entry 4565; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4566; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4567; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4568; GFX7-NEXT: s_add_u32 s0, s0, 16 4569; GFX7-NEXT: s_addc_u32 s1, s1, 0 4570; GFX7-NEXT: v_mov_b32_e32 v0, s0 4571; GFX7-NEXT: v_mov_b32_e32 v2, s2 4572; GFX7-NEXT: v_mov_b32_e32 v1, s1 4573; GFX7-NEXT: v_mov_b32_e32 v3, s3 4574; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4575; GFX7-NEXT: s_endpgm 4576; 4577; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: 4578; GFX10-WGP: ; %bb.0: ; %entry 4579; GFX10-WGP-NEXT: s_clause 0x1 4580; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4581; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4582; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) 4583; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4584; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4585; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4586; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4587; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4588; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4589; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4590; GFX10-WGP-NEXT: s_endpgm 4591; 4592; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: 4593; GFX10-CU: ; %bb.0: ; %entry 4594; GFX10-CU-NEXT: s_clause 0x1 4595; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4596; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4597; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4598; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4599; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4600; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4601; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4602; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4603; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4604; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4605; GFX10-CU-NEXT: s_endpgm 4606; 4607; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: 4608; SKIP-CACHE-INV: ; %bb.0: ; %entry 4609; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4610; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4611; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4612; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4613; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4614; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4615; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4616; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4617; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4618; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4619; SKIP-CACHE-INV-NEXT: s_endpgm 4620; 4621; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: 4622; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4623; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4624; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4625; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
4626; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4627; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4628; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4629; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4630; 4631; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: 4632; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4633; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4634; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4635; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4636; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4637; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4638; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4639; GFX90A-TGSPLIT-NEXT: s_endpgm 4640; 4641; 4642 i32* %out, i32 %in, i32 %old) { 4643entry: 4644 %gep = getelementptr i32, i32* %out, i32 4 4645 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic 4646 ret void 4647} 4648 4649define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( 4650; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: 4651; GFX7: ; %bb.0: ; %entry 4652; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4653; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4654; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4655; GFX7-NEXT: s_add_u32 s0, s0, 16 4656; GFX7-NEXT: s_addc_u32 s1, s1, 0 4657; GFX7-NEXT: v_mov_b32_e32 v0, s0 4658; GFX7-NEXT: v_mov_b32_e32 v2, s2 4659; GFX7-NEXT: v_mov_b32_e32 v1, s1 4660; GFX7-NEXT: v_mov_b32_e32 v3, s3 4661; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4662; GFX7-NEXT: s_endpgm 4663; 4664; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: 4665; GFX10-WGP: ; %bb.0: ; %entry 4666; GFX10-WGP-NEXT: s_clause 0x1 4667; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4668; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4669; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 
4670; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4671; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4672; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4673; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4674; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4675; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4676; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4677; GFX10-WGP-NEXT: s_endpgm 4678; 4679; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: 4680; GFX10-CU: ; %bb.0: ; %entry 4681; GFX10-CU-NEXT: s_clause 0x1 4682; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4683; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4684; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4685; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4686; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4687; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4688; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4689; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4690; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4691; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4692; GFX10-CU-NEXT: s_endpgm 4693; 4694; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: 4695; SKIP-CACHE-INV: ; %bb.0: ; %entry 4696; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4697; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4698; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4699; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4700; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4701; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4702; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4703; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4704; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4705; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4706; SKIP-CACHE-INV-NEXT: s_endpgm 4707; 4708; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: 4709; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4710; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4711; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4712; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4713; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4714; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4715; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4716; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4717; 4718; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: 4719; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4720; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4721; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4722; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4723; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4724; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4725; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4726; GFX90A-TGSPLIT-NEXT: s_endpgm 4727; 4728; 4729 i32* %out, i32 %in, i32 %old) { 4730entry: 4731 %gep = getelementptr i32, i32* %out, i32 4 4732 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic 4733 ret void 4734} 4735 4736define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( 4737; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: 4738; GFX7: ; %bb.0: ; %entry 4739; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4740; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4741; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4742; GFX7-NEXT: s_add_u32 s0, s0, 16 4743; GFX7-NEXT: s_addc_u32 s1, s1, 0 4744; GFX7-NEXT: v_mov_b32_e32 v0, s0 4745; GFX7-NEXT: v_mov_b32_e32 v2, s2 4746; GFX7-NEXT: v_mov_b32_e32 v1, s1 4747; GFX7-NEXT: v_mov_b32_e32 v3, s3 4748; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4749; GFX7-NEXT: s_endpgm 4750; 4751; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: 4752; GFX10-WGP: ; %bb.0: ; %entry 4753; GFX10-WGP-NEXT: s_clause 0x1 4754; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4755; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4756; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4757; 
GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4758; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4759; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4760; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4761; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4762; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4763; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4764; GFX10-WGP-NEXT: s_endpgm 4765; 4766; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: 4767; GFX10-CU: ; %bb.0: ; %entry 4768; GFX10-CU-NEXT: s_clause 0x1 4769; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4770; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4771; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4772; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4773; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4774; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4775; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4776; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4777; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4778; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4779; GFX10-CU-NEXT: s_endpgm 4780; 4781; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: 4782; SKIP-CACHE-INV: ; %bb.0: ; %entry 4783; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4784; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4785; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4786; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4787; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4788; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4789; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4790; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4791; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4792; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4793; SKIP-CACHE-INV-NEXT: s_endpgm 4794; 4795; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: 4796; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4797; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4798; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4799; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4800; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4801; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4802; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4803; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4804; 4805; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: 4806; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4807; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4808; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4809; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4810; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4811; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4812; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4813; GFX90A-TGSPLIT-NEXT: s_endpgm 4814; 4815; 4816 i32* %out, i32 %in, i32 %old) { 4817entry: 4818 %gep = getelementptr i32, i32* %out, i32 4 4819 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire 4820 ret void 4821} 4822 4823define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( 4824; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: 4825; GFX7: ; %bb.0: ; %entry 4826; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4827; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4828; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4829; GFX7-NEXT: s_add_u32 s0, s0, 16 4830; GFX7-NEXT: s_addc_u32 s1, s1, 0 4831; GFX7-NEXT: v_mov_b32_e32 v0, s0 4832; GFX7-NEXT: v_mov_b32_e32 v2, s2 4833; GFX7-NEXT: v_mov_b32_e32 v1, s1 4834; GFX7-NEXT: v_mov_b32_e32 v3, s3 4835; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4836; GFX7-NEXT: s_endpgm 4837; 4838; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: 4839; GFX10-WGP: ; %bb.0: ; %entry 4840; GFX10-WGP-NEXT: s_clause 0x1 4841; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4842; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4843; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4844; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 
4845; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4846; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4847; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4848; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4849; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4850; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4851; GFX10-WGP-NEXT: s_endpgm 4852; 4853; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: 4854; GFX10-CU: ; %bb.0: ; %entry 4855; GFX10-CU-NEXT: s_clause 0x1 4856; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4857; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4858; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4859; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4860; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4861; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4862; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4863; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4864; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4865; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4866; GFX10-CU-NEXT: s_endpgm 4867; 4868; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: 4869; SKIP-CACHE-INV: ; %bb.0: ; %entry 4870; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4871; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4872; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4873; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4874; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4875; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4876; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4877; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4878; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4879; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4880; SKIP-CACHE-INV-NEXT: s_endpgm 4881; 4882; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: 4883; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4884; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4885; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4886; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4887; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] 4888; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4889; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4890; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4891; 4892; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: 4893; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4894; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4895; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4896; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4897; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4898; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4899; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4900; GFX90A-TGSPLIT-NEXT: s_endpgm 4901; 4902; 4903 i32* %out, i32 %in, i32 %old) { 4904entry: 4905 %gep = getelementptr i32, i32* %out, i32 4 4906 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire 4907 ret void 4908} 4909 4910define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( 4911; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: 4912; GFX7: ; %bb.0: ; %entry 4913; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4914; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4915; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4916; GFX7-NEXT: s_add_u32 s0, s0, 16 4917; GFX7-NEXT: s_addc_u32 s1, s1, 0 4918; GFX7-NEXT: v_mov_b32_e32 v0, s0 4919; GFX7-NEXT: v_mov_b32_e32 v2, s2 4920; GFX7-NEXT: v_mov_b32_e32 v1, s1 4921; GFX7-NEXT: v_mov_b32_e32 v3, s3 4922; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4923; GFX7-NEXT: s_endpgm 4924; 4925; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: 4926; GFX10-WGP: ; %bb.0: ; %entry 4927; GFX10-WGP-NEXT: s_clause 0x1 4928; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4929; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4930; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4931; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4932; GFX10-WGP-NEXT: s_addc_u32 s1, 
s1, 0 4933; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4934; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4935; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4936; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4937; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4938; GFX10-WGP-NEXT: s_endpgm 4939; 4940; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: 4941; GFX10-CU: ; %bb.0: ; %entry 4942; GFX10-CU-NEXT: s_clause 0x1 4943; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4944; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4945; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4946; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4947; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4948; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4949; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4950; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4951; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4952; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4953; GFX10-CU-NEXT: s_endpgm 4954; 4955; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: 4956; SKIP-CACHE-INV: ; %bb.0: ; %entry 4957; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4958; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4959; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4960; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4961; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4962; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4963; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4964; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4965; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4966; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4967; SKIP-CACHE-INV-NEXT: s_endpgm 4968; 4969; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: 4970; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4971; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4972; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4973; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4974; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4975; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4976; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4977; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4978; 4979; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: 4980; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4981; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4982; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4983; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4984; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 4985; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 4986; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4987; GFX90A-TGSPLIT-NEXT: s_endpgm 4988; 4989; 4990 i32* %out, i32 %in, i32 %old) { 4991entry: 4992 %gep = getelementptr i32, i32* %out, i32 4 4993 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire 4994 ret void 4995} 4996 4997define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( 4998; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: 4999; GFX7: ; %bb.0: ; %entry 5000; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5001; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5002; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5003; GFX7-NEXT: s_add_u32 s0, s0, 16 5004; GFX7-NEXT: s_addc_u32 s1, s1, 0 5005; GFX7-NEXT: v_mov_b32_e32 v0, s0 5006; GFX7-NEXT: v_mov_b32_e32 v2, s2 5007; GFX7-NEXT: v_mov_b32_e32 v1, s1 5008; GFX7-NEXT: v_mov_b32_e32 v3, s3 5009; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5010; GFX7-NEXT: s_endpgm 5011; 5012; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: 5013; GFX10-WGP: ; %bb.0: ; %entry 5014; GFX10-WGP-NEXT: s_clause 0x1 5015; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5016; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5017; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5018; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5019; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5020; GFX10-WGP-NEXT: v_mov_b32_e32 
v0, s0 5021; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5022; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5023; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5024; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5025; GFX10-WGP-NEXT: s_endpgm 5026; 5027; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: 5028; GFX10-CU: ; %bb.0: ; %entry 5029; GFX10-CU-NEXT: s_clause 0x1 5030; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5031; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5032; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5033; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5034; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5035; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5036; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5037; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5038; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5039; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5040; GFX10-CU-NEXT: s_endpgm 5041; 5042; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: 5043; SKIP-CACHE-INV: ; %bb.0: ; %entry 5044; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5045; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5046; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5047; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5048; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5049; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5050; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5051; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5052; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5053; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5054; SKIP-CACHE-INV-NEXT: s_endpgm 5055; 5056; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: 5057; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5058; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5059; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5060; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5061; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5062; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] 5063; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5064; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5065; 5066; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: 5067; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5068; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5069; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5070; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5071; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5072; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5073; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5074; GFX90A-TGSPLIT-NEXT: s_endpgm 5075; 5076; 5077 i32* %out, i32 %in, i32 %old) { 5078entry: 5079 %gep = getelementptr i32, i32* %out, i32 4 5080 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire 5081 ret void 5082} 5083 5084define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( 5085; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: 5086; GFX7: ; %bb.0: ; %entry 5087; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5088; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5089; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5090; GFX7-NEXT: s_add_u32 s0, s0, 16 5091; GFX7-NEXT: s_addc_u32 s1, s1, 0 5092; GFX7-NEXT: v_mov_b32_e32 v0, s0 5093; GFX7-NEXT: v_mov_b32_e32 v2, s2 5094; GFX7-NEXT: v_mov_b32_e32 v1, s1 5095; GFX7-NEXT: v_mov_b32_e32 v3, s3 5096; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5097; GFX7-NEXT: s_endpgm 5098; 5099; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: 5100; GFX10-WGP: ; %bb.0: ; %entry 5101; GFX10-WGP-NEXT: s_clause 0x1 5102; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5103; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5104; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5105; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 5106; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 5107; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5108; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, s2 5109; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5110; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5111; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5112; GFX10-WGP-NEXT: s_endpgm 5113; 5114; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: 5115; GFX10-CU: ; %bb.0: ; %entry 5116; GFX10-CU-NEXT: s_clause 0x1 5117; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5118; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5119; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5120; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 5121; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 5122; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5123; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5124; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5125; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5126; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5127; GFX10-CU-NEXT: s_endpgm 5128; 5129; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: 5130; SKIP-CACHE-INV: ; %bb.0: ; %entry 5131; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5132; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5133; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5134; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 5135; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 5136; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5137; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5138; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5139; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5140; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5141; SKIP-CACHE-INV-NEXT: s_endpgm 5142; 5143; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: 5144; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5145; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5146; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5147; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5148; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5149; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5150; 
GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5151; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5152; 5153; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: 5154; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5155; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5156; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5157; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5158; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5159; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5160; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5161; GFX90A-TGSPLIT-NEXT: s_endpgm 5162; 5163; 5164 i32* %out, i32 %in, i32 %old) { 5165entry: 5166 %gep = getelementptr i32, i32* %out, i32 4 5167 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst 5168 ret void 5169} 5170 5171define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( 5172; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: 5173; GFX7: ; %bb.0: ; %entry 5174; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5175; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5176; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5177; GFX7-NEXT: s_add_u32 s4, s0, 16 5178; GFX7-NEXT: s_addc_u32 s5, s1, 0 5179; GFX7-NEXT: v_mov_b32_e32 v0, s4 5180; GFX7-NEXT: v_mov_b32_e32 v2, s2 5181; GFX7-NEXT: v_mov_b32_e32 v1, s5 5182; GFX7-NEXT: v_mov_b32_e32 v3, s3 5183; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5184; GFX7-NEXT: v_mov_b32_e32 v0, s0 5185; GFX7-NEXT: v_mov_b32_e32 v1, s1 5186; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5187; GFX7-NEXT: flat_store_dword v[0:1], v2 5188; GFX7-NEXT: s_endpgm 5189; 5190; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: 5191; GFX10-WGP: ; %bb.0: ; %entry 5192; GFX10-WGP-NEXT: s_clause 0x1 5193; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5194; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5195; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) 5196; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5197; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5198; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5199; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5200; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5201; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5202; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5203; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5204; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5205; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5206; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5207; GFX10-WGP-NEXT: s_endpgm 5208; 5209; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: 5210; GFX10-CU: ; %bb.0: ; %entry 5211; GFX10-CU-NEXT: s_clause 0x1 5212; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5213; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5214; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5215; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5216; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5217; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5218; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5219; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5220; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5221; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5222; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5223; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5224; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5225; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5226; GFX10-CU-NEXT: s_endpgm 5227; 5228; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: 5229; SKIP-CACHE-INV: ; %bb.0: ; %entry 5230; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5231; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5232; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5233; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5234; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5235; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5236; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5237; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5238; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5239; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5240; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5241; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5242; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5243; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5244; SKIP-CACHE-INV-NEXT: s_endpgm 5245; 5246; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: 5247; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5248; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5249; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5250; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5251; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5252; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5253; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5254; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5255; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5256; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5257; 5258; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: 5259; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5260; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5261; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5262; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5263; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5264; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5265; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5266; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5267; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5268; GFX90A-TGSPLIT-NEXT: s_endpgm 5269; 5270; 5271 i32* %out, i32 %in, i32 %old) { 5272entry: 5273 %gep = getelementptr i32, i32* %out, i32 4 5274 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic 5275 %val0 = extractvalue { i32, i1 } %val, 0 5276 store i32 %val0, i32* %out, align 4 5277 ret 
void 5278} 5279 5280define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( 5281; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: 5282; GFX7: ; %bb.0: ; %entry 5283; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5284; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5285; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5286; GFX7-NEXT: s_add_u32 s4, s0, 16 5287; GFX7-NEXT: s_addc_u32 s5, s1, 0 5288; GFX7-NEXT: v_mov_b32_e32 v0, s4 5289; GFX7-NEXT: v_mov_b32_e32 v2, s2 5290; GFX7-NEXT: v_mov_b32_e32 v1, s5 5291; GFX7-NEXT: v_mov_b32_e32 v3, s3 5292; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5293; GFX7-NEXT: v_mov_b32_e32 v0, s0 5294; GFX7-NEXT: v_mov_b32_e32 v1, s1 5295; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5296; GFX7-NEXT: flat_store_dword v[0:1], v2 5297; GFX7-NEXT: s_endpgm 5298; 5299; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: 5300; GFX10-WGP: ; %bb.0: ; %entry 5301; GFX10-WGP-NEXT: s_clause 0x1 5302; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5303; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5304; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5305; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5306; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5307; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5308; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5309; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5310; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5311; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5312; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5313; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5314; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5315; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5316; GFX10-WGP-NEXT: s_endpgm 5317; 5318; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: 5319; GFX10-CU: ; %bb.0: ; %entry 5320; GFX10-CU-NEXT: s_clause 0x1 5321; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5322; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5323; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5324; GFX10-CU-NEXT: 
s_add_u32 s4, s0, 16 5325; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5326; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5327; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5328; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5329; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5330; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5331; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5332; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5333; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5334; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5335; GFX10-CU-NEXT: s_endpgm 5336; 5337; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: 5338; SKIP-CACHE-INV: ; %bb.0: ; %entry 5339; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5340; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5341; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5342; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5343; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5344; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5345; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5346; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5347; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5348; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5349; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5350; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5351; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5352; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5353; SKIP-CACHE-INV-NEXT: s_endpgm 5354; 5355; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: 5356; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5357; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5358; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5359; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5360; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5361; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5362; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5363; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 5364; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5365; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5366; 5367; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: 5368; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5369; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5370; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5371; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5372; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5373; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5374; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5375; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5376; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5377; GFX90A-TGSPLIT-NEXT: s_endpgm 5378; 5379; 5380 i32* %out, i32 %in, i32 %old) { 5381entry: 5382 %gep = getelementptr i32, i32* %out, i32 4 5383 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic 5384 %val0 = extractvalue { i32, i1 } %val, 0 5385 store i32 %val0, i32* %out, align 4 5386 ret void 5387} 5388 5389define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( 5390; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: 5391; GFX7: ; %bb.0: ; %entry 5392; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5393; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5394; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5395; GFX7-NEXT: s_add_u32 s4, s0, 16 5396; GFX7-NEXT: s_addc_u32 s5, s1, 0 5397; GFX7-NEXT: v_mov_b32_e32 v0, s4 5398; GFX7-NEXT: v_mov_b32_e32 v2, s2 5399; GFX7-NEXT: v_mov_b32_e32 v1, s5 5400; GFX7-NEXT: v_mov_b32_e32 v3, s3 5401; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5402; GFX7-NEXT: v_mov_b32_e32 v0, s0 5403; GFX7-NEXT: v_mov_b32_e32 v1, s1 5404; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5405; GFX7-NEXT: flat_store_dword v[0:1], v2 5406; GFX7-NEXT: s_endpgm 5407; 5408; GFX10-WGP-LABEL: 
flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: 5409; GFX10-WGP: ; %bb.0: ; %entry 5410; GFX10-WGP-NEXT: s_clause 0x1 5411; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5412; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5413; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5414; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5415; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5416; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5417; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5418; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5419; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5420; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5421; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5422; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5423; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5424; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5425; GFX10-WGP-NEXT: s_endpgm 5426; 5427; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: 5428; GFX10-CU: ; %bb.0: ; %entry 5429; GFX10-CU-NEXT: s_clause 0x1 5430; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5431; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5432; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5433; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5434; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5435; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5436; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5437; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5438; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5439; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5440; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5441; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5442; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5443; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5444; GFX10-CU-NEXT: s_endpgm 5445; 5446; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: 5447; SKIP-CACHE-INV: ; %bb.0: ; %entry 5448; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5449; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5450; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5451; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 
16 5452; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5453; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5454; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5455; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5456; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5457; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5458; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5459; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5460; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5461; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5462; SKIP-CACHE-INV-NEXT: s_endpgm 5463; 5464; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: 5465; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5466; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5467; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5468; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5469; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5470; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5471; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5472; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5473; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5474; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5475; 5476; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: 5477; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5478; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5479; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5480; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5481; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5482; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5483; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5484; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5485; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5486; GFX90A-TGSPLIT-NEXT: s_endpgm 5487; 5488; 5489 i32* %out, i32 %in, i32 %old) { 5490entry: 
5491 %gep = getelementptr i32, i32* %out, i32 4 5492 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic 5493 %val0 = extractvalue { i32, i1 } %val, 0 5494 store i32 %val0, i32* %out, align 4 5495 ret void 5496} 5497 5498define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( 5499; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: 5500; GFX7: ; %bb.0: ; %entry 5501; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5502; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5503; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5504; GFX7-NEXT: s_add_u32 s4, s0, 16 5505; GFX7-NEXT: s_addc_u32 s5, s1, 0 5506; GFX7-NEXT: v_mov_b32_e32 v0, s4 5507; GFX7-NEXT: v_mov_b32_e32 v2, s2 5508; GFX7-NEXT: v_mov_b32_e32 v1, s5 5509; GFX7-NEXT: v_mov_b32_e32 v3, s3 5510; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5511; GFX7-NEXT: v_mov_b32_e32 v0, s0 5512; GFX7-NEXT: v_mov_b32_e32 v1, s1 5513; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5514; GFX7-NEXT: flat_store_dword v[0:1], v2 5515; GFX7-NEXT: s_endpgm 5516; 5517; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: 5518; GFX10-WGP: ; %bb.0: ; %entry 5519; GFX10-WGP-NEXT: s_clause 0x1 5520; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5521; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5522; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5523; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5524; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5525; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5526; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5527; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5528; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5529; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5530; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5531; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5532; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5533; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5534; GFX10-WGP-NEXT: s_endpgm 5535; 5536; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: 
5537; GFX10-CU: ; %bb.0: ; %entry 5538; GFX10-CU-NEXT: s_clause 0x1 5539; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5540; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5541; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5542; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5543; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5544; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5545; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5546; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5547; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5548; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5549; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5550; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5551; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5552; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5553; GFX10-CU-NEXT: s_endpgm 5554; 5555; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: 5556; SKIP-CACHE-INV: ; %bb.0: ; %entry 5557; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5558; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5559; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5560; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5561; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5562; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5563; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5564; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5565; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5566; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5567; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5568; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5569; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5570; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5571; SKIP-CACHE-INV-NEXT: s_endpgm 5572; 5573; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: 5574; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5575; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5576; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5577; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5578; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5579; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5580; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5581; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5582; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5583; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5584; 5585; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: 5586; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5587; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5588; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5589; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5590; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5591; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5592; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5593; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5594; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5595; GFX90A-TGSPLIT-NEXT: s_endpgm 5596; 5597; 5598 i32* %out, i32 %in, i32 %old) { 5599entry: 5600 %gep = getelementptr i32, i32* %out, i32 4 5601 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire 5602 %val0 = extractvalue { i32, i1 } %val, 0 5603 store i32 %val0, i32* %out, align 4 5604 ret void 5605} 5606 5607define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( 5608; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: 5609; GFX7: ; %bb.0: ; %entry 5610; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5611; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5612; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5613; GFX7-NEXT: s_add_u32 s4, s0, 16 5614; GFX7-NEXT: s_addc_u32 s5, s1, 0 5615; GFX7-NEXT: v_mov_b32_e32 v0, s4 5616; GFX7-NEXT: v_mov_b32_e32 v2, s2 5617; GFX7-NEXT: v_mov_b32_e32 v1, s5 5618; GFX7-NEXT: v_mov_b32_e32 v3, s3 5619; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5620; GFX7-NEXT: v_mov_b32_e32 
v0, s0 5621; GFX7-NEXT: v_mov_b32_e32 v1, s1 5622; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5623; GFX7-NEXT: flat_store_dword v[0:1], v2 5624; GFX7-NEXT: s_endpgm 5625; 5626; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: 5627; GFX10-WGP: ; %bb.0: ; %entry 5628; GFX10-WGP-NEXT: s_clause 0x1 5629; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5630; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5631; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5632; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5633; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5634; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5635; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5636; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5637; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5638; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5639; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5640; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5641; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5642; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5643; GFX10-WGP-NEXT: s_endpgm 5644; 5645; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: 5646; GFX10-CU: ; %bb.0: ; %entry 5647; GFX10-CU-NEXT: s_clause 0x1 5648; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5649; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5650; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5651; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5652; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5653; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5654; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5655; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5656; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5657; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5658; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5659; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5660; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5661; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5662; GFX10-CU-NEXT: s_endpgm 5663; 5664; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: 5665; SKIP-CACHE-INV: ; %bb.0: ; %entry 5666; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x9 5667; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5668; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5669; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5670; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5671; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5672; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5673; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5674; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5675; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5676; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5677; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5678; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5679; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5680; SKIP-CACHE-INV-NEXT: s_endpgm 5681; 5682; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: 5683; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5684; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5685; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5686; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5687; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5688; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5689; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5690; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5691; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5692; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5693; 5694; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: 5695; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5696; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5697; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5698; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5699; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5700; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5701; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5702; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5703; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5704; GFX90A-TGSPLIT-NEXT: s_endpgm 5705; 5706; 5707 i32* %out, i32 %in, i32 %old) { 5708entry: 5709 %gep = getelementptr i32, i32* %out, i32 4 5710 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire 5711 %val0 = extractvalue { i32, i1 } %val, 0 5712 store i32 %val0, i32* %out, align 4 5713 ret void 5714} 5715 5716define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( 5717; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: 5718; GFX7: ; %bb.0: ; %entry 5719; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5720; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5721; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5722; GFX7-NEXT: s_add_u32 s4, s0, 16 5723; GFX7-NEXT: s_addc_u32 s5, s1, 0 5724; GFX7-NEXT: v_mov_b32_e32 v0, s4 5725; GFX7-NEXT: v_mov_b32_e32 v2, s2 5726; GFX7-NEXT: v_mov_b32_e32 v1, s5 5727; GFX7-NEXT: v_mov_b32_e32 v3, s3 5728; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5729; GFX7-NEXT: v_mov_b32_e32 v0, s0 5730; GFX7-NEXT: v_mov_b32_e32 v1, s1 5731; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5732; GFX7-NEXT: flat_store_dword v[0:1], v2 5733; GFX7-NEXT: s_endpgm 5734; 5735; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: 5736; GFX10-WGP: ; %bb.0: ; %entry 5737; GFX10-WGP-NEXT: s_clause 0x1 5738; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5739; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5740; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5741; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5742; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5743; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5744; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5745; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5746; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5747; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5748; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5749; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5750; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 5751; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5752; GFX10-WGP-NEXT: s_endpgm 5753; 5754; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: 5755; GFX10-CU: ; %bb.0: ; %entry 5756; GFX10-CU-NEXT: s_clause 0x1 5757; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5758; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5759; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5760; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5761; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5762; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5763; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5764; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5765; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5766; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5767; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5768; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5769; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5770; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5771; GFX10-CU-NEXT: s_endpgm 5772; 5773; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: 5774; SKIP-CACHE-INV: ; %bb.0: ; %entry 5775; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5776; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5777; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5778; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5779; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5780; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5781; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5782; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5783; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5784; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5785; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5786; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5787; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5788; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5789; SKIP-CACHE-INV-NEXT: s_endpgm 5790; 5791; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: 5792; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5793; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5794; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5795; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5796; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5797; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5798; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5799; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5800; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5801; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5802; 5803; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: 5804; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5805; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5806; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5807; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5808; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5809; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5810; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5811; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5812; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5813; GFX90A-TGSPLIT-NEXT: s_endpgm 5814; 5815; 5816 i32* %out, i32 %in, i32 %old) { 5817entry: 5818 %gep = getelementptr i32, i32* %out, i32 4 5819 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire 5820 %val0 = extractvalue { i32, i1 } %val, 0 5821 store i32 %val0, i32* %out, align 4 5822 ret void 5823} 5824 5825define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( 5826; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: 5827; GFX7: ; %bb.0: ; %entry 5828; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5829; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5830; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5831; GFX7-NEXT: s_add_u32 s4, s0, 16 5832; GFX7-NEXT: s_addc_u32 s5, s1, 0 5833; GFX7-NEXT: v_mov_b32_e32 v0, s4 
5834; GFX7-NEXT: v_mov_b32_e32 v2, s2 5835; GFX7-NEXT: v_mov_b32_e32 v1, s5 5836; GFX7-NEXT: v_mov_b32_e32 v3, s3 5837; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5838; GFX7-NEXT: v_mov_b32_e32 v0, s0 5839; GFX7-NEXT: v_mov_b32_e32 v1, s1 5840; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5841; GFX7-NEXT: flat_store_dword v[0:1], v2 5842; GFX7-NEXT: s_endpgm 5843; 5844; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: 5845; GFX10-WGP: ; %bb.0: ; %entry 5846; GFX10-WGP-NEXT: s_clause 0x1 5847; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5848; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5849; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5850; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5851; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5852; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5853; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5854; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5855; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5856; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5857; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5858; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5859; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5860; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5861; GFX10-WGP-NEXT: s_endpgm 5862; 5863; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: 5864; GFX10-CU: ; %bb.0: ; %entry 5865; GFX10-CU-NEXT: s_clause 0x1 5866; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5867; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5868; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5869; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5870; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5871; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5872; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5873; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5874; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5875; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5876; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5877; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5878; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5879; GFX10-CU-NEXT: 
flat_store_dword v[0:1], v2 5880; GFX10-CU-NEXT: s_endpgm 5881; 5882; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: 5883; SKIP-CACHE-INV: ; %bb.0: ; %entry 5884; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5885; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5886; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5887; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5888; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5889; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5890; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5891; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5892; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5893; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5894; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5895; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5896; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5897; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5898; SKIP-CACHE-INV-NEXT: s_endpgm 5899; 5900; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: 5901; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5902; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5903; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5904; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5905; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5906; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5907; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5908; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5909; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5910; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5911; 5912; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: 5913; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5914; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5915; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5916; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5917; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 5918; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 5919; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5920; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5921; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5922; GFX90A-TGSPLIT-NEXT: s_endpgm 5923; 5924; 5925 i32* %out, i32 %in, i32 %old) { 5926entry: 5927 %gep = getelementptr i32, i32* %out, i32 4 5928 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire 5929 %val0 = extractvalue { i32, i1 } %val, 0 5930 store i32 %val0, i32* %out, align 4 5931 ret void 5932} 5933 5934define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( 5935; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: 5936; GFX7: ; %bb.0: ; %entry 5937; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5938; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5939; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5940; GFX7-NEXT: s_add_u32 s4, s0, 16 5941; GFX7-NEXT: s_addc_u32 s5, s1, 0 5942; GFX7-NEXT: v_mov_b32_e32 v0, s4 5943; GFX7-NEXT: v_mov_b32_e32 v2, s2 5944; GFX7-NEXT: v_mov_b32_e32 v1, s5 5945; GFX7-NEXT: v_mov_b32_e32 v3, s3 5946; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5947; GFX7-NEXT: v_mov_b32_e32 v0, s0 5948; GFX7-NEXT: v_mov_b32_e32 v1, s1 5949; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5950; GFX7-NEXT: flat_store_dword v[0:1], v2 5951; GFX7-NEXT: s_endpgm 5952; 5953; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: 5954; GFX10-WGP: ; %bb.0: ; %entry 5955; GFX10-WGP-NEXT: s_clause 0x1 5956; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5957; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5958; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5959; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5960; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5961; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5962; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5963; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5964; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5965; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5966; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5967; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5968; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5969; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5970; GFX10-WGP-NEXT: s_endpgm 5971; 5972; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: 5973; GFX10-CU: ; %bb.0: ; %entry 5974; GFX10-CU-NEXT: s_clause 0x1 5975; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5976; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5977; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5978; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5979; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5980; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5981; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5982; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5983; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5984; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5985; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5986; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5987; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5988; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5989; GFX10-CU-NEXT: s_endpgm 5990; 5991; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: 5992; SKIP-CACHE-INV: ; %bb.0: ; %entry 5993; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5994; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5995; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5996; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5997; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5998; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5999; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 6000; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 6001; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 6002; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6003; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6004; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6005; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6006; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[0:1], v2 6007; SKIP-CACHE-INV-NEXT: s_endpgm 6008; 6009; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: 6010; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6011; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6012; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6013; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6014; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6015; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6016; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6017; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6018; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6019; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6020; 6021; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: 6022; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6023; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6024; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 6025; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6026; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 6027; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 6028; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6029; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6030; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6031; GFX90A-TGSPLIT-NEXT: s_endpgm 6032; 6033; 6034 i32* %out, i32 %in, i32 %old) { 6035entry: 6036 %gep = getelementptr i32, i32* %out, i32 4 6037 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst 6038 %val0 = extractvalue { i32, i1 } %val, 0 6039 store i32 %val0, i32* %out, align 4 6040 ret void 6041} 6042 6043