; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s

; Each kernel below performs one atomic operation with syncscope("workgroup")
; on an addrspace(3) pointer. The check lines between a kernel's name and its
; parameter list are generated; regenerate them with the script named in the
; NOTE line instead of editing them by hand.

; Atomic load, ordering "unordered". The loaded value is written to %out with
; a plain (non-atomic) store.
define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX6-LABEL: local_workgroup_unordered_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v1, v0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v1, v0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}

; Atomic load, ordering "monotonic". The loaded value is written to %out with
; a plain store.
define amdgpu_kernel void @local_workgroup_monotonic_load(
; GFX6-LABEL: local_workgroup_monotonic_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v1, v0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v1, v0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}

; Atomic load, ordering "acquire". After the ds_read_b32 the expected output
; has buffer_gl0_inv for GFX10-WGP and buffer_wbinvl1_vol for GFX90A-TGSPLIT;
; the remaining prefixes expect no invalidate instruction.
define amdgpu_kernel void @local_workgroup_acquire_load(
; GFX6-LABEL: local_workgroup_acquire_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v1, v0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v1, v0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}

; Atomic load, ordering "seq_cst". Relative to the acquire-load kernel, every
; prefix expects an additional s_waitcnt directly ahead of the ds_read_b32
; (GFX10-WGP also expects s_waitcnt_vscnt there).
define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX6-LABEL: local_workgroup_seq_cst_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v1, v0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v1, v0
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4
  store i32 %val, i32 addrspace(3)* %out
  ret void
}

; Atomic store of the i32 argument %in to %out, ordering "unordered".
define amdgpu_kernel void @local_workgroup_unordered_store(
; GFX6-LABEL: local_workgroup_unordered_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s1
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4
  ret void
}

; Atomic store of the i32 argument %in to %out, ordering "monotonic".
define amdgpu_kernel void @local_workgroup_monotonic_store(
; GFX6-LABEL: local_workgroup_monotonic_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s1
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4
  ret void
}

; Atomic store of %in to %out, ordering "release". Every prefix expects a wait
; (s_waitcnt, plus s_waitcnt_vscnt on GFX10-WGP) ahead of the ds_write_b32.
define amdgpu_kernel void @local_workgroup_release_store(
; GFX6-LABEL: local_workgroup_release_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
  ret void
}

; Atomic store of %in to %out, ordering "seq_cst". The expected instruction
; sequences are the same as for the release-store kernel.
define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX6-LABEL: local_workgroup_seq_cst_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 %in, i32 addrspace(3)* %out) {
entry:
  store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4
  ret void
}

; Volatile atomicrmw xchg of %in into %out, ordering "monotonic". The returned
; value %val is not used.
define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
; GFX6-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic
  ret void
}

; Volatile atomicrmw xchg, ordering "acquire". After the ds_wrxchg_rtn_b32 the
; expected output has s_waitcnt plus buffer_gl0_inv for GFX10-WGP, only
; buffer_wbinvl1_vol for GFX90A-TGSPLIT, and only s_waitcnt for the rest.
define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
; GFX6-LABEL: local_workgroup_acquire_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
  ret void
}

; Volatile atomicrmw xchg, ordering "release". Every prefix expects a wait
; (s_waitcnt, plus s_waitcnt_vscnt on GFX10-WGP) ahead of ds_wrxchg_rtn_b32 and
; nothing between it and s_endpgm.
define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX6-LABEL: local_workgroup_release_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_workgroup_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_workgroup_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_workgroup_release_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_workgroup_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32 addrspace(3)* %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release
  ret void
}

define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX6-LABEL: local_workgroup_acq_rel_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT:
ds_wrxchg_rtn_b32 v0, v0, v1 940; GFX7-NEXT: s_waitcnt lgkmcnt(0) 941; GFX7-NEXT: s_endpgm 942; 943; GFX10-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: 944; GFX10-WGP: ; %bb.0: ; %entry 945; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 946; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 947; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 948; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 949; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 950; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 951; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 952; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 953; GFX10-WGP-NEXT: buffer_gl0_inv 954; GFX10-WGP-NEXT: s_endpgm 955; 956; GFX10-CU-LABEL: local_workgroup_acq_rel_atomicrmw: 957; GFX10-CU: ; %bb.0: ; %entry 958; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 959; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 960; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 961; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 962; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 963; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 964; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 965; GFX10-CU-NEXT: s_endpgm 966; 967; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_atomicrmw: 968; SKIP-CACHE-INV: ; %bb.0: ; %entry 969; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 970; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 971; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 972; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 973; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 974; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 975; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 976; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 977; SKIP-CACHE-INV-NEXT: s_endpgm 978; 979; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: 980; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 981; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 982; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 983; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 984; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 985; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 986; GFX90A-NOTTGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 987; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 988; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 989; 990; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: 991; GFX90A-TGSPLIT: ; %bb.0: ; %entry 992; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 993; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 994; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 995; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 996; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 997; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 998; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 999; GFX90A-TGSPLIT-NEXT: s_endpgm 1000 i32 addrspace(3)* %out, i32 %in) { 1001entry: 1002 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel 1003 ret void 1004} 1005 1006define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( 1007; GFX6-LABEL: local_workgroup_seq_cst_atomicrmw: 1008; GFX6: ; %bb.0: ; %entry 1009; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 1010; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 1011; GFX6-NEXT: s_mov_b32 m0, -1 1012; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1013; GFX6-NEXT: v_mov_b32_e32 v0, s0 1014; GFX6-NEXT: v_mov_b32_e32 v1, s1 1015; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1016; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 1017; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1018; GFX6-NEXT: s_endpgm 1019; 1020; GFX7-LABEL: local_workgroup_seq_cst_atomicrmw: 1021; GFX7: ; %bb.0: ; %entry 1022; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1023; GFX7-NEXT: s_mov_b32 m0, -1 1024; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX7-NEXT: v_mov_b32_e32 v0, s0 1026; GFX7-NEXT: v_mov_b32_e32 v1, s1 1027; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 1029; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1030; GFX7-NEXT: s_endpgm 1031; 1032; GFX10-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: 1033; GFX10-WGP: ; %bb.0: ; %entry 1034; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1035; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1036; GFX10-WGP-NEXT: v_mov_b32_e32 
v0, s0 1037; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1038; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1039; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1040; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 1041; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1042; GFX10-WGP-NEXT: buffer_gl0_inv 1043; GFX10-WGP-NEXT: s_endpgm 1044; 1045; GFX10-CU-LABEL: local_workgroup_seq_cst_atomicrmw: 1046; GFX10-CU: ; %bb.0: ; %entry 1047; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1048; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1049; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1050; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1051; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 1053; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1054; GFX10-CU-NEXT: s_endpgm 1055; 1056; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_atomicrmw: 1057; SKIP-CACHE-INV: ; %bb.0: ; %entry 1058; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1059; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1060; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1061; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1062; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1063; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1064; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 1065; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1066; SKIP-CACHE-INV-NEXT: s_endpgm 1067; 1068; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: 1069; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1070; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1071; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1072; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1073; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 1074; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1075; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 1076; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1077; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1078; 1079; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: 1080; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1081; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 1082; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1084; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 1085; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1086; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 1087; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1088; GFX90A-TGSPLIT-NEXT: s_endpgm 1089 i32 addrspace(3)* %out, i32 %in) { 1090entry: 1091 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst 1092 ret void 1093} 1094 1095define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( 1096; GFX6-LABEL: local_workgroup_acquire_ret_atomicrmw: 1097; GFX6: ; %bb.0: ; %entry 1098; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 1099; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 1100; GFX6-NEXT: s_mov_b32 m0, -1 1101; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1102; GFX6-NEXT: v_mov_b32_e32 v0, s0 1103; GFX6-NEXT: v_mov_b32_e32 v1, s1 1104; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1105; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1106; GFX6-NEXT: ds_write_b32 v0, v1 1107; GFX6-NEXT: s_endpgm 1108; 1109; GFX7-LABEL: local_workgroup_acquire_ret_atomicrmw: 1110; GFX7: ; %bb.0: ; %entry 1111; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1112; GFX7-NEXT: s_mov_b32 m0, -1 1113; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX7-NEXT: v_mov_b32_e32 v0, s0 1115; GFX7-NEXT: v_mov_b32_e32 v1, s1 1116; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1117; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1118; GFX7-NEXT: ds_write_b32 v0, v1 1119; GFX7-NEXT: s_endpgm 1120; 1121; GFX10-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw: 1122; GFX10-WGP: ; %bb.0: ; %entry 1123; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1124; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1126; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1127; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1128; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1129; GFX10-WGP-NEXT: buffer_gl0_inv 1130; GFX10-WGP-NEXT: ds_write_b32 v0, v1 1131; GFX10-WGP-NEXT: 
s_endpgm 1132; 1133; GFX10-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: 1134; GFX10-CU: ; %bb.0: ; %entry 1135; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1136; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1137; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1138; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1139; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1140; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1141; GFX10-CU-NEXT: ds_write_b32 v0, v1 1142; GFX10-CU-NEXT: s_endpgm 1143; 1144; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_ret_atomicrmw: 1145; SKIP-CACHE-INV: ; %bb.0: ; %entry 1146; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1147; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1148; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1149; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1150; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1151; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1152; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1153; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 1154; SKIP-CACHE-INV-NEXT: s_endpgm 1155; 1156; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: 1157; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1158; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1159; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1160; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1161; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 1162; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1163; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1164; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 1165; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1166; 1167; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: 1168; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1169; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1170; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1171; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1172; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 1173; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1174; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1175; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 1176; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 1177; GFX90A-TGSPLIT-NEXT: s_endpgm 1178 i32 addrspace(3)* %out, i32 %in) { 1179entry: 1180 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire 1181 store i32 %val, i32 addrspace(3)* %out, align 4 1182 ret void 1183} 1184 1185define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( 1186; GFX6-LABEL: local_workgroup_acq_rel_ret_atomicrmw: 1187; GFX6: ; %bb.0: ; %entry 1188; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 1189; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 1190; GFX6-NEXT: s_mov_b32 m0, -1 1191; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1192; GFX6-NEXT: v_mov_b32_e32 v0, s0 1193; GFX6-NEXT: v_mov_b32_e32 v1, s1 1194; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1196; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1197; GFX6-NEXT: ds_write_b32 v0, v1 1198; GFX6-NEXT: s_endpgm 1199; 1200; GFX7-LABEL: local_workgroup_acq_rel_ret_atomicrmw: 1201; GFX7: ; %bb.0: ; %entry 1202; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1203; GFX7-NEXT: s_mov_b32 m0, -1 1204; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1205; GFX7-NEXT: v_mov_b32_e32 v0, s0 1206; GFX7-NEXT: v_mov_b32_e32 v1, s1 1207; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1208; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1209; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1210; GFX7-NEXT: ds_write_b32 v0, v1 1211; GFX7-NEXT: s_endpgm 1212; 1213; GFX10-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw: 1214; GFX10-WGP: ; %bb.0: ; %entry 1215; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1216; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1217; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1218; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1219; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1220; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1221; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1222; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1223; GFX10-WGP-NEXT: buffer_gl0_inv 1224; GFX10-WGP-NEXT: ds_write_b32 v0, v1 1225; GFX10-WGP-NEXT: s_endpgm 1226; 1227; 
GFX10-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: 1228; GFX10-CU: ; %bb.0: ; %entry 1229; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1230; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1231; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1232; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1233; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1234; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1235; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX10-CU-NEXT: ds_write_b32 v0, v1 1237; GFX10-CU-NEXT: s_endpgm 1238; 1239; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_ret_atomicrmw: 1240; SKIP-CACHE-INV: ; %bb.0: ; %entry 1241; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1242; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1243; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1244; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1245; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1246; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1247; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1248; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1249; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 1250; SKIP-CACHE-INV-NEXT: s_endpgm 1251; 1252; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: 1253; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1254; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1255; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1256; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1257; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 1258; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1259; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1260; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 1262; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1263; 1264; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: 1265; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1266; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1267; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1268; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1269; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 1270; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1271; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1272; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1273; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1274; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 1275; GFX90A-TGSPLIT-NEXT: s_endpgm 1276 i32 addrspace(3)* %out, i32 %in) { 1277entry: 1278 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel 1279 store i32 %val, i32 addrspace(3)* %out, align 4 1280 ret void 1281} 1282 1283define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( 1284; GFX6-LABEL: local_workgroup_seq_cst_ret_atomicrmw: 1285; GFX6: ; %bb.0: ; %entry 1286; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 1287; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 1288; GFX6-NEXT: s_mov_b32 m0, -1 1289; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1290; GFX6-NEXT: v_mov_b32_e32 v0, s0 1291; GFX6-NEXT: v_mov_b32_e32 v1, s1 1292; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1293; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1294; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1295; GFX6-NEXT: ds_write_b32 v0, v1 1296; GFX6-NEXT: s_endpgm 1297; 1298; GFX7-LABEL: local_workgroup_seq_cst_ret_atomicrmw: 1299; GFX7: ; %bb.0: ; %entry 1300; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1301; GFX7-NEXT: s_mov_b32 m0, -1 1302; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1303; GFX7-NEXT: v_mov_b32_e32 v0, s0 1304; GFX7-NEXT: v_mov_b32_e32 v1, s1 1305; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1306; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1307; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1308; GFX7-NEXT: ds_write_b32 v0, v1 1309; GFX7-NEXT: s_endpgm 1310; 1311; GFX10-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw: 1312; GFX10-WGP: ; %bb.0: ; %entry 1313; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1314; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1315; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1316; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1317; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1318; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1319; GFX10-WGP-NEXT: 
ds_wrxchg_rtn_b32 v1, v0, v1 1320; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1321; GFX10-WGP-NEXT: buffer_gl0_inv 1322; GFX10-WGP-NEXT: ds_write_b32 v0, v1 1323; GFX10-WGP-NEXT: s_endpgm 1324; 1325; GFX10-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: 1326; GFX10-CU: ; %bb.0: ; %entry 1327; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1328; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1329; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1330; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1331; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1332; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1333; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1334; GFX10-CU-NEXT: ds_write_b32 v0, v1 1335; GFX10-CU-NEXT: s_endpgm 1336; 1337; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_ret_atomicrmw: 1338; SKIP-CACHE-INV: ; %bb.0: ; %entry 1339; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1340; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1341; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1342; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1343; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1344; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1345; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1346; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1347; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 1348; SKIP-CACHE-INV-NEXT: s_endpgm 1349; 1350; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: 1351; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1352; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1353; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1354; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1355; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 1356; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1357; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1358; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1359; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 1360; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1361; 1362; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: 1363; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1364; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 1365; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1366; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1367; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 1368; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1369; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 1370; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1371; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1372; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 1373; GFX90A-TGSPLIT-NEXT: s_endpgm 1374 i32 addrspace(3)* %out, i32 %in) { 1375entry: 1376 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst 1377 store i32 %val, i32 addrspace(3)* %out, align 4 1378 ret void 1379} 1380 1381define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( 1382; GFX6-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: 1383; GFX6: ; %bb.0: ; %entry 1384; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 1385; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 1386; GFX6-NEXT: s_mov_b32 m0, -1 1387; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1388; GFX6-NEXT: v_mov_b32_e32 v0, s2 1389; GFX6-NEXT: v_mov_b32_e32 v1, s1 1390; GFX6-NEXT: v_mov_b32_e32 v2, s0 1391; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1392; GFX6-NEXT: s_endpgm 1393; 1394; GFX7-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: 1395; GFX7: ; %bb.0: ; %entry 1396; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1397; GFX7-NEXT: s_mov_b32 m0, -1 1398; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1399; GFX7-NEXT: v_mov_b32_e32 v0, s0 1400; GFX7-NEXT: v_mov_b32_e32 v1, s2 1401; GFX7-NEXT: v_mov_b32_e32 v2, s1 1402; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1403; GFX7-NEXT: s_endpgm 1404; 1405; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: 1406; GFX10-WGP: ; %bb.0: ; %entry 1407; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1408; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1409; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1410; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1411; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 1412; 
GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1413; GFX10-WGP-NEXT: s_endpgm 1414; 1415; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: 1416; GFX10-CU: ; %bb.0: ; %entry 1417; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1418; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1419; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1420; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 1421; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 1422; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1423; GFX10-CU-NEXT: s_endpgm 1424; 1425; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: 1426; SKIP-CACHE-INV: ; %bb.0: ; %entry 1427; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1428; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1429; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1430; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1431; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1432; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 1433; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 1434; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1435; SKIP-CACHE-INV-NEXT: s_endpgm 1436; 1437; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: 1438; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1439; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1440; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1442; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1443; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1444; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1445; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1446; 1447; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: 1448; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1449; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1450; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1452; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1453; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1454; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 
1455; GFX90A-TGSPLIT-NEXT: s_endpgm 1456 i32 addrspace(3)* %out, i32 %in, i32 %old) { 1457entry: 1458 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 1459 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic 1460 ret void 1461} 1462 1463define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( 1464; GFX6-LABEL: local_workgroup_acquire_monotonic_cmpxchg: 1465; GFX6: ; %bb.0: ; %entry 1466; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 1467; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 1468; GFX6-NEXT: s_mov_b32 m0, -1 1469; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1470; GFX6-NEXT: v_mov_b32_e32 v0, s2 1471; GFX6-NEXT: v_mov_b32_e32 v1, s1 1472; GFX6-NEXT: v_mov_b32_e32 v2, s0 1473; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1474; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1475; GFX6-NEXT: s_endpgm 1476; 1477; GFX7-LABEL: local_workgroup_acquire_monotonic_cmpxchg: 1478; GFX7: ; %bb.0: ; %entry 1479; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1480; GFX7-NEXT: s_mov_b32 m0, -1 1481; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1482; GFX7-NEXT: v_mov_b32_e32 v0, s0 1483; GFX7-NEXT: v_mov_b32_e32 v1, s2 1484; GFX7-NEXT: v_mov_b32_e32 v2, s1 1485; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1486; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1487; GFX7-NEXT: s_endpgm 1488; 1489; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: 1490; GFX10-WGP: ; %bb.0: ; %entry 1491; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1492; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1493; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1494; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1495; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 1496; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1497; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1498; GFX10-WGP-NEXT: buffer_gl0_inv 1499; GFX10-WGP-NEXT: s_endpgm 1500; 1501; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: 1502; GFX10-CU: ; %bb.0: ; %entry 1503; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1504; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) 1505; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1506; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 1507; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 1508; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1509; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1510; GFX10-CU-NEXT: s_endpgm 1511; 1512; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_cmpxchg: 1513; SKIP-CACHE-INV: ; %bb.0: ; %entry 1514; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1515; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1516; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1517; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1518; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1519; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 1520; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 1521; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1522; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1523; SKIP-CACHE-INV-NEXT: s_endpgm 1524; 1525; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: 1526; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1527; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1528; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1529; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1530; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1531; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1532; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1533; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1534; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1535; 1536; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: 1537; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1538; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1539; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1540; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1541; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1542; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1543; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1544; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1545; GFX90A-TGSPLIT-NEXT: s_endpgm 1546 i32 addrspace(3)* %out, i32 %in, i32 %old) { 
1547entry: 1548 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 1549 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic 1550 ret void 1551} 1552 1553define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( 1554; GFX6-LABEL: local_workgroup_release_monotonic_cmpxchg: 1555; GFX6: ; %bb.0: ; %entry 1556; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 1557; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 1558; GFX6-NEXT: s_mov_b32 m0, -1 1559; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1560; GFX6-NEXT: v_mov_b32_e32 v0, s2 1561; GFX6-NEXT: v_mov_b32_e32 v1, s1 1562; GFX6-NEXT: v_mov_b32_e32 v2, s0 1563; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1564; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1565; GFX6-NEXT: s_endpgm 1566; 1567; GFX7-LABEL: local_workgroup_release_monotonic_cmpxchg: 1568; GFX7: ; %bb.0: ; %entry 1569; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1570; GFX7-NEXT: s_mov_b32 m0, -1 1571; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1572; GFX7-NEXT: v_mov_b32_e32 v0, s0 1573; GFX7-NEXT: v_mov_b32_e32 v1, s2 1574; GFX7-NEXT: v_mov_b32_e32 v2, s1 1575; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1576; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1577; GFX7-NEXT: s_endpgm 1578; 1579; GFX10-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: 1580; GFX10-WGP: ; %bb.0: ; %entry 1581; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1582; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1584; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1585; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 1586; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1587; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1588; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1589; GFX10-WGP-NEXT: s_endpgm 1590; 1591; GFX10-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: 1592; GFX10-CU: ; %bb.0: ; %entry 1593; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1594; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1595; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1596; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 1597; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 1598; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1599; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1600; GFX10-CU-NEXT: s_endpgm 1601; 1602; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_cmpxchg: 1603; SKIP-CACHE-INV: ; %bb.0: ; %entry 1604; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1605; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1606; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1607; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1608; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1609; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 1610; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 1611; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1612; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1613; SKIP-CACHE-INV-NEXT: s_endpgm 1614; 1615; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: 1616; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1617; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1618; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1619; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1620; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1621; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1622; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1623; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1624; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1625; 1626; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: 1627; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1628; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1629; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1630; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1631; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1632; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1633; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1634; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1635; GFX90A-TGSPLIT-NEXT: s_endpgm 1636 i32 addrspace(3)* %out, i32 %in, i32 %old) { 1637entry: 1638 %gep = getelementptr i32, i32 
addrspace(3)* %out, i32 4 1639 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic 1640 ret void 1641} 1642 1643define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( 1644; GFX6-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: 1645; GFX6: ; %bb.0: ; %entry 1646; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 1647; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 1648; GFX6-NEXT: s_mov_b32 m0, -1 1649; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1650; GFX6-NEXT: v_mov_b32_e32 v0, s2 1651; GFX6-NEXT: v_mov_b32_e32 v1, s1 1652; GFX6-NEXT: v_mov_b32_e32 v2, s0 1653; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1654; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1655; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1656; GFX6-NEXT: s_endpgm 1657; 1658; GFX7-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: 1659; GFX7: ; %bb.0: ; %entry 1660; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1661; GFX7-NEXT: s_mov_b32 m0, -1 1662; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1663; GFX7-NEXT: v_mov_b32_e32 v0, s0 1664; GFX7-NEXT: v_mov_b32_e32 v1, s2 1665; GFX7-NEXT: v_mov_b32_e32 v2, s1 1666; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1667; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1668; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX7-NEXT: s_endpgm 1670; 1671; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: 1672; GFX10-WGP: ; %bb.0: ; %entry 1673; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1674; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1675; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1676; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1677; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 1678; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1679; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1680; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1681; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1682; GFX10-WGP-NEXT: buffer_gl0_inv 1683; GFX10-WGP-NEXT: s_endpgm 1684; 1685; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: 1686; GFX10-CU: ; %bb.0: ; %entry 1687; GFX10-CU-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 1688; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1689; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1690; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 1691; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 1692; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1693; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1694; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1695; GFX10-CU-NEXT: s_endpgm 1696; 1697; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: 1698; SKIP-CACHE-INV: ; %bb.0: ; %entry 1699; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1700; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1701; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1702; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1703; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1704; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 1705; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 1706; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1707; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1708; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1709; SKIP-CACHE-INV-NEXT: s_endpgm 1710; 1711; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: 1712; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1713; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1714; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1715; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1716; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1717; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1718; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1719; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1720; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1721; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1722; 1723; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: 1724; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1725; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1726; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1728; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1729; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1730; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1731; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1732; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1733; GFX90A-TGSPLIT-NEXT: s_endpgm 1734 i32 addrspace(3)* %out, i32 %in, i32 %old) { 1735entry: 1736 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 1737 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic 1738 ret void 1739} 1740 1741define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( 1742; GFX6-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: 1743; GFX6: ; %bb.0: ; %entry 1744; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 1745; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 1746; GFX6-NEXT: s_mov_b32 m0, -1 1747; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1748; GFX6-NEXT: v_mov_b32_e32 v0, s2 1749; GFX6-NEXT: v_mov_b32_e32 v1, s1 1750; GFX6-NEXT: v_mov_b32_e32 v2, s0 1751; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1752; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1753; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1754; GFX6-NEXT: s_endpgm 1755; 1756; GFX7-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: 1757; GFX7: ; %bb.0: ; %entry 1758; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1759; GFX7-NEXT: s_mov_b32 m0, -1 1760; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1761; GFX7-NEXT: v_mov_b32_e32 v0, s0 1762; GFX7-NEXT: v_mov_b32_e32 v1, s2 1763; GFX7-NEXT: v_mov_b32_e32 v2, s1 1764; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1765; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1766; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1767; GFX7-NEXT: s_endpgm 1768; 1769; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: 1770; GFX10-WGP: ; %bb.0: ; %entry 1771; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1772; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1773; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1774; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1775; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 1776; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1777; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1778; 
GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1779; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1780; GFX10-WGP-NEXT: buffer_gl0_inv 1781; GFX10-WGP-NEXT: s_endpgm 1782; 1783; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: 1784; GFX10-CU: ; %bb.0: ; %entry 1785; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1786; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1787; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1788; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 1789; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 1790; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1791; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1792; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1793; GFX10-CU-NEXT: s_endpgm 1794; 1795; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: 1796; SKIP-CACHE-INV: ; %bb.0: ; %entry 1797; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1798; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1799; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1800; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1801; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1802; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 1803; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 1804; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1805; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1806; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1807; SKIP-CACHE-INV-NEXT: s_endpgm 1808; 1809; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: 1810; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1811; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1812; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1813; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1814; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1815; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1816; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1817; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1818; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1819; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1820; 1821; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: 
1822; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1823; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1824; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1825; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1826; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1827; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1828; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1829; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1830; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1831; GFX90A-TGSPLIT-NEXT: s_endpgm 1832 i32 addrspace(3)* %out, i32 %in, i32 %old) { 1833entry: 1834 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 1835 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic 1836 ret void 1837} 1838 1839define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( 1840; GFX6-LABEL: local_workgroup_acquire_acquire_cmpxchg: 1841; GFX6: ; %bb.0: ; %entry 1842; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 1843; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 1844; GFX6-NEXT: s_mov_b32 m0, -1 1845; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1846; GFX6-NEXT: v_mov_b32_e32 v0, s2 1847; GFX6-NEXT: v_mov_b32_e32 v1, s1 1848; GFX6-NEXT: v_mov_b32_e32 v2, s0 1849; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1850; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1851; GFX6-NEXT: s_endpgm 1852; 1853; GFX7-LABEL: local_workgroup_acquire_acquire_cmpxchg: 1854; GFX7: ; %bb.0: ; %entry 1855; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1856; GFX7-NEXT: s_mov_b32 m0, -1 1857; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1858; GFX7-NEXT: v_mov_b32_e32 v0, s0 1859; GFX7-NEXT: v_mov_b32_e32 v1, s2 1860; GFX7-NEXT: v_mov_b32_e32 v2, s1 1861; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1862; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1863; GFX7-NEXT: s_endpgm 1864; 1865; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: 1866; GFX10-WGP: ; %bb.0: ; %entry 1867; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1868; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1869; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 1870; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1871; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 1872; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1873; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1874; GFX10-WGP-NEXT: buffer_gl0_inv 1875; GFX10-WGP-NEXT: s_endpgm 1876; 1877; GFX10-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: 1878; GFX10-CU: ; %bb.0: ; %entry 1879; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1880; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1881; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1882; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 1883; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 1884; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1885; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1886; GFX10-CU-NEXT: s_endpgm 1887; 1888; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_cmpxchg: 1889; SKIP-CACHE-INV: ; %bb.0: ; %entry 1890; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1891; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1892; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1893; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1894; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1895; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 1896; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 1897; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1898; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1899; SKIP-CACHE-INV-NEXT: s_endpgm 1900; 1901; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: 1902; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1903; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1904; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1905; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1906; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1907; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1908; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1909; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1910; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1911; 1912; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: 1913; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry 1914; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1915; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1916; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 1917; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 1918; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 1919; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1920; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1921; GFX90A-TGSPLIT-NEXT: s_endpgm 1922 i32 addrspace(3)* %out, i32 %in, i32 %old) { 1923entry: 1924 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 1925 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire 1926 ret void 1927} 1928 1929define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( 1930; GFX6-LABEL: local_workgroup_release_acquire_cmpxchg: 1931; GFX6: ; %bb.0: ; %entry 1932; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 1933; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 1934; GFX6-NEXT: s_mov_b32 m0, -1 1935; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1936; GFX6-NEXT: v_mov_b32_e32 v0, s2 1937; GFX6-NEXT: v_mov_b32_e32 v1, s1 1938; GFX6-NEXT: v_mov_b32_e32 v2, s0 1939; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1940; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1941; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1942; GFX6-NEXT: s_endpgm 1943; 1944; GFX7-LABEL: local_workgroup_release_acquire_cmpxchg: 1945; GFX7: ; %bb.0: ; %entry 1946; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1947; GFX7-NEXT: s_mov_b32 m0, -1 1948; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX7-NEXT: v_mov_b32_e32 v0, s0 1950; GFX7-NEXT: v_mov_b32_e32 v1, s2 1951; GFX7-NEXT: v_mov_b32_e32 v2, s1 1952; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1953; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1954; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1955; GFX7-NEXT: s_endpgm 1956; 1957; GFX10-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: 1958; GFX10-WGP: ; %bb.0: ; %entry 1959; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1960; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1961; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1962; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1963; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 1964; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1965; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1966; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1967; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1968; GFX10-WGP-NEXT: buffer_gl0_inv 1969; GFX10-WGP-NEXT: s_endpgm 1970; 1971; GFX10-CU-LABEL: local_workgroup_release_acquire_cmpxchg: 1972; GFX10-CU: ; %bb.0: ; %entry 1973; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1974; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1975; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1976; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 1977; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 1978; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1979; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1980; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1981; GFX10-CU-NEXT: s_endpgm 1982; 1983; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_cmpxchg: 1984; SKIP-CACHE-INV: ; %bb.0: ; %entry 1985; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1986; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1987; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 1988; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1989; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1990; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 1991; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 1992; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1993; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 1994; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1995; SKIP-CACHE-INV-NEXT: s_endpgm 1996; 1997; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: 1998; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1999; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2000; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2001; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2002; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2003; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2004; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2005; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 2006; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2007; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2008; 2009; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: 2010; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2011; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2012; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2013; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2014; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2015; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2016; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2017; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2018; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2019; GFX90A-TGSPLIT-NEXT: s_endpgm 2020 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2021entry: 2022 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2023 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire 2024 ret void 2025} 2026 2027define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( 2028; GFX6-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: 2029; GFX6: ; %bb.0: ; %entry 2030; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2031; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2032; GFX6-NEXT: s_mov_b32 m0, -1 2033; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2034; GFX6-NEXT: v_mov_b32_e32 v0, s2 2035; GFX6-NEXT: v_mov_b32_e32 v1, s1 2036; GFX6-NEXT: v_mov_b32_e32 v2, s0 2037; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2038; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2039; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2040; GFX6-NEXT: s_endpgm 2041; 2042; GFX7-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: 2043; GFX7: ; %bb.0: ; %entry 2044; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2045; GFX7-NEXT: s_mov_b32 m0, -1 2046; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2047; GFX7-NEXT: v_mov_b32_e32 v0, s0 2048; GFX7-NEXT: v_mov_b32_e32 v1, s2 2049; GFX7-NEXT: v_mov_b32_e32 v2, s1 2050; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2051; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2052; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2053; 
GFX7-NEXT: s_endpgm 2054; 2055; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: 2056; GFX10-WGP: ; %bb.0: ; %entry 2057; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2058; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2059; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2060; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2061; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2062; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2063; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2064; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2065; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2066; GFX10-WGP-NEXT: buffer_gl0_inv 2067; GFX10-WGP-NEXT: s_endpgm 2068; 2069; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: 2070; GFX10-CU: ; %bb.0: ; %entry 2071; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2072; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2073; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2074; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2075; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2076; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2077; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2078; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2079; GFX10-CU-NEXT: s_endpgm 2080; 2081; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: 2082; SKIP-CACHE-INV: ; %bb.0: ; %entry 2083; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2084; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2085; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2086; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2087; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2088; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2089; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2090; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2091; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2092; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2093; SKIP-CACHE-INV-NEXT: s_endpgm 2094; 2095; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: 2096; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2097; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2098; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 2099; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2100; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2101; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2102; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2103; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2104; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2105; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2106; 2107; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: 2108; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2109; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2110; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2111; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2112; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2113; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2114; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2115; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2116; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2117; GFX90A-TGSPLIT-NEXT: s_endpgm 2118 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2119entry: 2120 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2121 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire 2122 ret void 2123} 2124 2125define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( 2126; GFX6-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: 2127; GFX6: ; %bb.0: ; %entry 2128; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2129; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2130; GFX6-NEXT: s_mov_b32 m0, -1 2131; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2132; GFX6-NEXT: v_mov_b32_e32 v0, s2 2133; GFX6-NEXT: v_mov_b32_e32 v1, s1 2134; GFX6-NEXT: v_mov_b32_e32 v2, s0 2135; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2136; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2137; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2138; GFX6-NEXT: s_endpgm 2139; 2140; GFX7-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: 2141; GFX7: ; %bb.0: ; %entry 2142; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2143; GFX7-NEXT: s_mov_b32 m0, -1 2144; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) 2145; GFX7-NEXT: v_mov_b32_e32 v0, s0 2146; GFX7-NEXT: v_mov_b32_e32 v1, s2 2147; GFX7-NEXT: v_mov_b32_e32 v2, s1 2148; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2149; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2150; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2151; GFX7-NEXT: s_endpgm 2152; 2153; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: 2154; GFX10-WGP: ; %bb.0: ; %entry 2155; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2156; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2157; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2158; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2159; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2160; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2161; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2162; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2163; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2164; GFX10-WGP-NEXT: buffer_gl0_inv 2165; GFX10-WGP-NEXT: s_endpgm 2166; 2167; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: 2168; GFX10-CU: ; %bb.0: ; %entry 2169; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2170; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2171; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2172; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2173; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2174; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2175; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2176; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2177; GFX10-CU-NEXT: s_endpgm 2178; 2179; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: 2180; SKIP-CACHE-INV: ; %bb.0: ; %entry 2181; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2182; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2183; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2184; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2185; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2186; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2187; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2188; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2189; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2190; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2191; SKIP-CACHE-INV-NEXT: s_endpgm 2192; 2193; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: 2194; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2195; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2196; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2197; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2198; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2199; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2200; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2201; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2202; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2203; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2204; 2205; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: 2206; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2207; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2208; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2209; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2210; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2211; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2212; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2213; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2214; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2215; GFX90A-TGSPLIT-NEXT: s_endpgm 2216 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2217entry: 2218 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2219 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire 2220 ret void 2221} 2222 2223define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( 2224; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: 2225; GFX6: ; %bb.0: ; %entry 2226; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2227; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2228; GFX6-NEXT: s_mov_b32 m0, -1 2229; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2230; GFX6-NEXT: v_mov_b32_e32 v0, s2 2231; GFX6-NEXT: v_mov_b32_e32 v1, s1 2232; GFX6-NEXT: v_mov_b32_e32 v2, s0 2233; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2234; GFX6-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 2235; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2236; GFX6-NEXT: s_endpgm 2237; 2238; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: 2239; GFX7: ; %bb.0: ; %entry 2240; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2241; GFX7-NEXT: s_mov_b32 m0, -1 2242; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2243; GFX7-NEXT: v_mov_b32_e32 v0, s0 2244; GFX7-NEXT: v_mov_b32_e32 v1, s2 2245; GFX7-NEXT: v_mov_b32_e32 v2, s1 2246; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2247; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2248; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2249; GFX7-NEXT: s_endpgm 2250; 2251; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: 2252; GFX10-WGP: ; %bb.0: ; %entry 2253; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2254; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2255; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2256; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2257; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2258; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2259; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2260; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2261; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2262; GFX10-WGP-NEXT: buffer_gl0_inv 2263; GFX10-WGP-NEXT: s_endpgm 2264; 2265; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: 2266; GFX10-CU: ; %bb.0: ; %entry 2267; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2268; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2269; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2270; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2271; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2272; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2273; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2274; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2275; GFX10-CU-NEXT: s_endpgm 2276; 2277; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: 2278; SKIP-CACHE-INV: ; %bb.0: ; %entry 2279; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2280; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2281; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2282; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) 2283; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2284; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2285; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2286; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2287; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2288; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2289; SKIP-CACHE-INV-NEXT: s_endpgm 2290; 2291; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: 2292; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2293; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2294; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2295; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2296; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2297; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2298; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2299; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2300; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2301; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2302; 2303; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: 2304; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2305; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2306; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2307; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2308; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2309; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2310; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2311; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 2312; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2313; GFX90A-TGSPLIT-NEXT: s_endpgm 2314 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2315entry: 2316 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2317 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst 2318 ret void 2319} 2320 2321define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( 2322; GFX6-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: 2323; GFX6: ; %bb.0: ; %entry 2324; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2325; 
GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2326; GFX6-NEXT: s_mov_b32 m0, -1 2327; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2328; GFX6-NEXT: v_mov_b32_e32 v0, s2 2329; GFX6-NEXT: v_mov_b32_e32 v1, s1 2330; GFX6-NEXT: v_mov_b32_e32 v2, s0 2331; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2332; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2333; GFX6-NEXT: ds_write_b32 v0, v1 2334; GFX6-NEXT: s_endpgm 2335; 2336; GFX7-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: 2337; GFX7: ; %bb.0: ; %entry 2338; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2339; GFX7-NEXT: s_mov_b32 m0, -1 2340; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2341; GFX7-NEXT: v_mov_b32_e32 v0, s0 2342; GFX7-NEXT: v_mov_b32_e32 v1, s2 2343; GFX7-NEXT: v_mov_b32_e32 v2, s1 2344; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2345; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2346; GFX7-NEXT: ds_write_b32 v0, v1 2347; GFX7-NEXT: s_endpgm 2348; 2349; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: 2350; GFX10-WGP: ; %bb.0: ; %entry 2351; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2352; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2353; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2354; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2355; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2356; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2357; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2358; GFX10-WGP-NEXT: buffer_gl0_inv 2359; GFX10-WGP-NEXT: ds_write_b32 v0, v1 2360; GFX10-WGP-NEXT: s_endpgm 2361; 2362; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: 2363; GFX10-CU: ; %bb.0: ; %entry 2364; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2365; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2366; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2367; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2368; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2369; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2370; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2371; GFX10-CU-NEXT: ds_write_b32 v0, v1 2372; GFX10-CU-NEXT: s_endpgm 2373; 2374; SKIP-CACHE-INV-LABEL: 
local_workgroup_acquire_monotonic_ret_cmpxchg: 2375; SKIP-CACHE-INV: ; %bb.0: ; %entry 2376; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2377; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2378; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2379; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2380; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2381; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2382; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2383; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2384; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2385; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 2386; SKIP-CACHE-INV-NEXT: s_endpgm 2387; 2388; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: 2389; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2390; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2391; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2392; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2393; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2394; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2395; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2396; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2397; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 2398; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2399; 2400; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: 2401; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2402; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2403; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2404; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2405; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2406; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2407; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2408; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2409; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2410; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 2411; GFX90A-TGSPLIT-NEXT: s_endpgm 2412 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2413entry: 2414 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2415 
%val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic 2416 %val0 = extractvalue { i32, i1 } %val, 0 2417 store i32 %val0, i32 addrspace(3)* %out, align 4 2418 ret void 2419} 2420 2421define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( 2422; GFX6-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: 2423; GFX6: ; %bb.0: ; %entry 2424; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2425; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2426; GFX6-NEXT: s_mov_b32 m0, -1 2427; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2428; GFX6-NEXT: v_mov_b32_e32 v0, s2 2429; GFX6-NEXT: v_mov_b32_e32 v1, s1 2430; GFX6-NEXT: v_mov_b32_e32 v2, s0 2431; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2432; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2433; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2434; GFX6-NEXT: ds_write_b32 v0, v1 2435; GFX6-NEXT: s_endpgm 2436; 2437; GFX7-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: 2438; GFX7: ; %bb.0: ; %entry 2439; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2440; GFX7-NEXT: s_mov_b32 m0, -1 2441; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2442; GFX7-NEXT: v_mov_b32_e32 v0, s0 2443; GFX7-NEXT: v_mov_b32_e32 v1, s2 2444; GFX7-NEXT: v_mov_b32_e32 v2, s1 2445; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2446; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2447; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2448; GFX7-NEXT: ds_write_b32 v0, v1 2449; GFX7-NEXT: s_endpgm 2450; 2451; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: 2452; GFX10-WGP: ; %bb.0: ; %entry 2453; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2454; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2455; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2456; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2457; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2458; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2459; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2460; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2461; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2462; GFX10-WGP-NEXT: 
buffer_gl0_inv 2463; GFX10-WGP-NEXT: ds_write_b32 v0, v1 2464; GFX10-WGP-NEXT: s_endpgm 2465; 2466; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: 2467; GFX10-CU: ; %bb.0: ; %entry 2468; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2469; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2470; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2471; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2472; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2473; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2474; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2475; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2476; GFX10-CU-NEXT: ds_write_b32 v0, v1 2477; GFX10-CU-NEXT: s_endpgm 2478; 2479; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: 2480; SKIP-CACHE-INV: ; %bb.0: ; %entry 2481; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2482; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2483; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2484; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2485; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2486; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2487; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2488; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2489; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2490; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2491; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 2492; SKIP-CACHE-INV-NEXT: s_endpgm 2493; 2494; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: 2495; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2496; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2497; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2498; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2499; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2500; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2501; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2502; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2503; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2504; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 2505; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2506; 2507; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: 2508; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2509; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2510; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2511; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2512; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2513; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2514; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2515; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2516; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2517; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2518; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 2519; GFX90A-TGSPLIT-NEXT: s_endpgm 2520 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2521entry: 2522 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2523 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic 2524 %val0 = extractvalue { i32, i1 } %val, 0 2525 store i32 %val0, i32 addrspace(3)* %out, align 4 2526 ret void 2527} 2528 2529define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( 2530; GFX6-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: 2531; GFX6: ; %bb.0: ; %entry 2532; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2533; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2534; GFX6-NEXT: s_mov_b32 m0, -1 2535; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2536; GFX6-NEXT: v_mov_b32_e32 v0, s2 2537; GFX6-NEXT: v_mov_b32_e32 v1, s1 2538; GFX6-NEXT: v_mov_b32_e32 v2, s0 2539; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2540; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2541; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2542; GFX6-NEXT: ds_write_b32 v0, v1 2543; GFX6-NEXT: s_endpgm 2544; 2545; GFX7-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: 2546; GFX7: ; %bb.0: ; %entry 2547; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2548; GFX7-NEXT: s_mov_b32 m0, -1 2549; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2550; GFX7-NEXT: v_mov_b32_e32 v0, s0 2551; 
GFX7-NEXT: v_mov_b32_e32 v1, s2 2552; GFX7-NEXT: v_mov_b32_e32 v2, s1 2553; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2554; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2555; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2556; GFX7-NEXT: ds_write_b32 v0, v1 2557; GFX7-NEXT: s_endpgm 2558; 2559; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: 2560; GFX10-WGP: ; %bb.0: ; %entry 2561; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2562; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2563; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2564; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2565; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2566; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2567; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2568; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2569; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2570; GFX10-WGP-NEXT: buffer_gl0_inv 2571; GFX10-WGP-NEXT: ds_write_b32 v0, v1 2572; GFX10-WGP-NEXT: s_endpgm 2573; 2574; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: 2575; GFX10-CU: ; %bb.0: ; %entry 2576; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2577; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2578; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2579; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2580; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2581; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2582; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2583; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2584; GFX10-CU-NEXT: ds_write_b32 v0, v1 2585; GFX10-CU-NEXT: s_endpgm 2586; 2587; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: 2588; SKIP-CACHE-INV: ; %bb.0: ; %entry 2589; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2590; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2591; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2592; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2593; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2594; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2595; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2596; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 
2597; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2598; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2599; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 2600; SKIP-CACHE-INV-NEXT: s_endpgm 2601; 2602; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: 2603; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2604; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2605; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2606; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2607; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2608; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2609; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2610; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2611; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2612; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 2613; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2614; 2615; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: 2616; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2617; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2618; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2619; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2620; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2621; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2622; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2623; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2624; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2625; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2626; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 2627; GFX90A-TGSPLIT-NEXT: s_endpgm 2628 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2629entry: 2630 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2631 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic 2632 %val0 = extractvalue { i32, i1 } %val, 0 2633 store i32 %val0, i32 addrspace(3)* %out, align 4 2634 ret void 2635} 2636 2637define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( 2638; GFX6-LABEL: 
local_workgroup_acquire_acquire_ret_cmpxchg: 2639; GFX6: ; %bb.0: ; %entry 2640; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2641; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2642; GFX6-NEXT: s_mov_b32 m0, -1 2643; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2644; GFX6-NEXT: v_mov_b32_e32 v0, s2 2645; GFX6-NEXT: v_mov_b32_e32 v1, s1 2646; GFX6-NEXT: v_mov_b32_e32 v2, s0 2647; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2648; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2649; GFX6-NEXT: ds_write_b32 v0, v1 2650; GFX6-NEXT: s_endpgm 2651; 2652; GFX7-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: 2653; GFX7: ; %bb.0: ; %entry 2654; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2655; GFX7-NEXT: s_mov_b32 m0, -1 2656; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2657; GFX7-NEXT: v_mov_b32_e32 v0, s0 2658; GFX7-NEXT: v_mov_b32_e32 v1, s2 2659; GFX7-NEXT: v_mov_b32_e32 v2, s1 2660; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2661; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2662; GFX7-NEXT: ds_write_b32 v0, v1 2663; GFX7-NEXT: s_endpgm 2664; 2665; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: 2666; GFX10-WGP: ; %bb.0: ; %entry 2667; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2668; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2669; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2670; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2671; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2672; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2673; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2674; GFX10-WGP-NEXT: buffer_gl0_inv 2675; GFX10-WGP-NEXT: ds_write_b32 v0, v1 2676; GFX10-WGP-NEXT: s_endpgm 2677; 2678; GFX10-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: 2679; GFX10-CU: ; %bb.0: ; %entry 2680; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2681; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2682; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2683; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2684; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2685; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2686; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) 2687; GFX10-CU-NEXT: ds_write_b32 v0, v1 2688; GFX10-CU-NEXT: s_endpgm 2689; 2690; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: 2691; SKIP-CACHE-INV: ; %bb.0: ; %entry 2692; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2693; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2694; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2695; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2696; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2697; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2698; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2699; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2700; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2701; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 2702; SKIP-CACHE-INV-NEXT: s_endpgm 2703; 2704; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: 2705; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2706; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2707; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2708; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2709; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2710; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2711; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2712; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2713; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 2714; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2715; 2716; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: 2717; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2718; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2719; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2720; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2721; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2722; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2723; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2724; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2725; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2726; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 2727; GFX90A-TGSPLIT-NEXT: s_endpgm 2728 i32 
addrspace(3)* %out, i32 %in, i32 %old) { 2729entry: 2730 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2731 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire 2732 %val0 = extractvalue { i32, i1 } %val, 0 2733 store i32 %val0, i32 addrspace(3)* %out, align 4 2734 ret void 2735} 2736 2737define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( 2738; GFX6-LABEL: local_workgroup_release_acquire_ret_cmpxchg: 2739; GFX6: ; %bb.0: ; %entry 2740; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2741; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2742; GFX6-NEXT: s_mov_b32 m0, -1 2743; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2744; GFX6-NEXT: v_mov_b32_e32 v0, s2 2745; GFX6-NEXT: v_mov_b32_e32 v1, s1 2746; GFX6-NEXT: v_mov_b32_e32 v2, s0 2747; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2748; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2749; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2750; GFX6-NEXT: ds_write_b32 v0, v1 2751; GFX6-NEXT: s_endpgm 2752; 2753; GFX7-LABEL: local_workgroup_release_acquire_ret_cmpxchg: 2754; GFX7: ; %bb.0: ; %entry 2755; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2756; GFX7-NEXT: s_mov_b32 m0, -1 2757; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2758; GFX7-NEXT: v_mov_b32_e32 v0, s0 2759; GFX7-NEXT: v_mov_b32_e32 v1, s2 2760; GFX7-NEXT: v_mov_b32_e32 v2, s1 2761; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2762; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2763; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2764; GFX7-NEXT: ds_write_b32 v0, v1 2765; GFX7-NEXT: s_endpgm 2766; 2767; GFX10-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: 2768; GFX10-WGP: ; %bb.0: ; %entry 2769; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2770; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2771; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2772; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2773; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2774; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2775; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2776; GFX10-WGP-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2777; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2778; GFX10-WGP-NEXT: buffer_gl0_inv 2779; GFX10-WGP-NEXT: ds_write_b32 v0, v1 2780; GFX10-WGP-NEXT: s_endpgm 2781; 2782; GFX10-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: 2783; GFX10-CU: ; %bb.0: ; %entry 2784; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2785; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2786; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2787; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2788; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2789; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2790; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2791; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2792; GFX10-CU-NEXT: ds_write_b32 v0, v1 2793; GFX10-CU-NEXT: s_endpgm 2794; 2795; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_ret_cmpxchg: 2796; SKIP-CACHE-INV: ; %bb.0: ; %entry 2797; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2798; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2799; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2800; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2801; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2802; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2803; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2804; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2805; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2806; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2807; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 2808; SKIP-CACHE-INV-NEXT: s_endpgm 2809; 2810; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: 2811; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2812; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2813; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2814; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2815; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2816; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2817; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2818; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2819; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2820; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 2821; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2822; 2823; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: 2824; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2825; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2826; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2827; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2828; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2829; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2830; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2831; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2832; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2833; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2834; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 2835; GFX90A-TGSPLIT-NEXT: s_endpgm 2836 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2837entry: 2838 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2839 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire 2840 %val0 = extractvalue { i32, i1 } %val, 0 2841 store i32 %val0, i32 addrspace(3)* %out, align 4 2842 ret void 2843} 2844 2845define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( 2846; GFX6-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: 2847; GFX6: ; %bb.0: ; %entry 2848; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2849; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2850; GFX6-NEXT: s_mov_b32 m0, -1 2851; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2852; GFX6-NEXT: v_mov_b32_e32 v0, s2 2853; GFX6-NEXT: v_mov_b32_e32 v1, s1 2854; GFX6-NEXT: v_mov_b32_e32 v2, s0 2855; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2856; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2857; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2858; GFX6-NEXT: ds_write_b32 v0, v1 2859; GFX6-NEXT: s_endpgm 2860; 2861; GFX7-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: 2862; GFX7: ; %bb.0: ; %entry 2863; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2864; GFX7-NEXT: s_mov_b32 m0, 
-1 2865; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2866; GFX7-NEXT: v_mov_b32_e32 v0, s0 2867; GFX7-NEXT: v_mov_b32_e32 v1, s2 2868; GFX7-NEXT: v_mov_b32_e32 v2, s1 2869; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2870; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2871; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2872; GFX7-NEXT: ds_write_b32 v0, v1 2873; GFX7-NEXT: s_endpgm 2874; 2875; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: 2876; GFX10-WGP: ; %bb.0: ; %entry 2877; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2878; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2879; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2880; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2881; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2882; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2883; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2884; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2885; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2886; GFX10-WGP-NEXT: buffer_gl0_inv 2887; GFX10-WGP-NEXT: ds_write_b32 v0, v1 2888; GFX10-WGP-NEXT: s_endpgm 2889; 2890; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: 2891; GFX10-CU: ; %bb.0: ; %entry 2892; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2893; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2894; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2895; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2896; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 2897; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2898; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2899; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2900; GFX10-CU-NEXT: ds_write_b32 v0, v1 2901; GFX10-CU-NEXT: s_endpgm 2902; 2903; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: 2904; SKIP-CACHE-INV: ; %bb.0: ; %entry 2905; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2906; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 2907; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 2908; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2909; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2910; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 2911; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2912; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2913; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2914; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2915; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 2916; SKIP-CACHE-INV-NEXT: s_endpgm 2917; 2918; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: 2919; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2920; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2921; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2922; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2923; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2924; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2925; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2926; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2927; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2928; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 2929; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2930; 2931; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: 2932; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2933; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2934; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2935; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 2936; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 2937; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 2938; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2939; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2940; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2941; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2942; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 2943; GFX90A-TGSPLIT-NEXT: s_endpgm 2944 i32 addrspace(3)* %out, i32 %in, i32 %old) { 2945entry: 2946 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 2947 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire 2948 %val0 = extractvalue { i32, i1 } %val, 0 2949 store i32 %val0, i32 addrspace(3)* %out, align 4 2950 ret void 2951} 2952 2953define 
amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( 2954; GFX6-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: 2955; GFX6: ; %bb.0: ; %entry 2956; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 2957; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 2958; GFX6-NEXT: s_mov_b32 m0, -1 2959; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2960; GFX6-NEXT: v_mov_b32_e32 v0, s2 2961; GFX6-NEXT: v_mov_b32_e32 v1, s1 2962; GFX6-NEXT: v_mov_b32_e32 v2, s0 2963; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2964; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2965; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2966; GFX6-NEXT: ds_write_b32 v0, v1 2967; GFX6-NEXT: s_endpgm 2968; 2969; GFX7-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: 2970; GFX7: ; %bb.0: ; %entry 2971; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2972; GFX7-NEXT: s_mov_b32 m0, -1 2973; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2974; GFX7-NEXT: v_mov_b32_e32 v0, s0 2975; GFX7-NEXT: v_mov_b32_e32 v1, s2 2976; GFX7-NEXT: v_mov_b32_e32 v2, s1 2977; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2978; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2979; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2980; GFX7-NEXT: ds_write_b32 v0, v1 2981; GFX7-NEXT: s_endpgm 2982; 2983; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: 2984; GFX10-WGP: ; %bb.0: ; %entry 2985; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2986; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2987; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2988; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2989; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 2990; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2991; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2992; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 2993; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2994; GFX10-WGP-NEXT: buffer_gl0_inv 2995; GFX10-WGP-NEXT: ds_write_b32 v0, v1 2996; GFX10-WGP-NEXT: s_endpgm 2997; 2998; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: 2999; GFX10-CU: ; %bb.0: ; %entry 3000; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3001; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3002; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3003; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3004; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 3005; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3006; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3007; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3008; GFX10-CU-NEXT: ds_write_b32 v0, v1 3009; GFX10-CU-NEXT: s_endpgm 3010; 3011; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: 3012; SKIP-CACHE-INV: ; %bb.0: ; %entry 3013; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3014; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3015; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3016; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3017; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3018; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 3019; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3020; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3021; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3022; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3023; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 3024; SKIP-CACHE-INV-NEXT: s_endpgm 3025; 3026; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: 3027; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3028; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3029; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3030; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3031; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 3032; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 3033; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3034; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3035; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3036; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 3037; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3038; 3039; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: 3040; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3041; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3042; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3043; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3044; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 3045; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 3046; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3047; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3048; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3049; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3050; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 3051; GFX90A-TGSPLIT-NEXT: s_endpgm 3052 i32 addrspace(3)* %out, i32 %in, i32 %old) { 3053entry: 3054 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 3055 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire 3056 %val0 = extractvalue { i32, i1 } %val, 0 3057 store i32 %val0, i32 addrspace(3)* %out, align 4 3058 ret void 3059} 3060 3061define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( 3062; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: 3063; GFX6: ; %bb.0: ; %entry 3064; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 3065; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 3066; GFX6-NEXT: s_mov_b32 m0, -1 3067; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3068; GFX6-NEXT: v_mov_b32_e32 v0, s2 3069; GFX6-NEXT: v_mov_b32_e32 v1, s1 3070; GFX6-NEXT: v_mov_b32_e32 v2, s0 3071; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3072; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3073; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3074; GFX6-NEXT: ds_write_b32 v0, v1 3075; GFX6-NEXT: s_endpgm 3076; 3077; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: 3078; GFX7: ; %bb.0: ; %entry 3079; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3080; GFX7-NEXT: s_mov_b32 m0, -1 3081; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3082; GFX7-NEXT: v_mov_b32_e32 v0, s0 3083; GFX7-NEXT: v_mov_b32_e32 v1, s2 3084; GFX7-NEXT: v_mov_b32_e32 v2, s1 3085; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3086; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3087; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3088; GFX7-NEXT: ds_write_b32 v0, v1 3089; GFX7-NEXT: s_endpgm 3090; 3091; 
GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: 3092; GFX10-WGP: ; %bb.0: ; %entry 3093; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3094; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3095; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3096; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3097; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 3098; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3099; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3100; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3101; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3102; GFX10-WGP-NEXT: buffer_gl0_inv 3103; GFX10-WGP-NEXT: ds_write_b32 v0, v1 3104; GFX10-WGP-NEXT: s_endpgm 3105; 3106; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: 3107; GFX10-CU: ; %bb.0: ; %entry 3108; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3109; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3110; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3111; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3112; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 3113; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3114; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3115; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3116; GFX10-CU-NEXT: ds_write_b32 v0, v1 3117; GFX10-CU-NEXT: s_endpgm 3118; 3119; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: 3120; SKIP-CACHE-INV: ; %bb.0: ; %entry 3121; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3122; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3123; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3124; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3125; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3126; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 3127; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3128; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3129; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3130; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3131; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 3132; SKIP-CACHE-INV-NEXT: s_endpgm 3133; 3134; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: 
3135; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3136; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3137; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3138; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3139; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 3140; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 3141; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3142; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3143; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3144; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 3145; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3146; 3147; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: 3148; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3149; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 3150; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3151; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3152; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 3153; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 3154; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3155; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 3156; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3157; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3158; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 3159; GFX90A-TGSPLIT-NEXT: s_endpgm 3160 i32 addrspace(3)* %out, i32 %in, i32 %old) { 3161entry: 3162 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 3163 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst 3164 %val0 = extractvalue { i32, i1 } %val, 0 3165 store i32 %val0, i32 addrspace(3)* %out, align 4 3166 ret void 3167} 3168 3169define amdgpu_kernel void @local_workgroup_one_as_unordered_load( 3170; GFX6-LABEL: local_workgroup_one_as_unordered_load: 3171; GFX6: ; %bb.0: ; %entry 3172; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3173; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3174; GFX6-NEXT: s_mov_b32 m0, -1 3175; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3176; GFX6-NEXT: v_mov_b32_e32 v0, s0 3177; GFX6-NEXT: ds_read_b32 v0, 
v0 3178; GFX6-NEXT: v_mov_b32_e32 v1, s1 3179; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3180; GFX6-NEXT: ds_write_b32 v1, v0 3181; GFX6-NEXT: s_endpgm 3182; 3183; GFX7-LABEL: local_workgroup_one_as_unordered_load: 3184; GFX7: ; %bb.0: ; %entry 3185; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3186; GFX7-NEXT: s_mov_b32 m0, -1 3187; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3188; GFX7-NEXT: v_mov_b32_e32 v0, s0 3189; GFX7-NEXT: ds_read_b32 v0, v0 3190; GFX7-NEXT: v_mov_b32_e32 v1, s1 3191; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3192; GFX7-NEXT: ds_write_b32 v1, v0 3193; GFX7-NEXT: s_endpgm 3194; 3195; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_load: 3196; GFX10-WGP: ; %bb.0: ; %entry 3197; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3198; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3199; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3200; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3201; GFX10-WGP-NEXT: ds_read_b32 v0, v0 3202; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3203; GFX10-WGP-NEXT: ds_write_b32 v1, v0 3204; GFX10-WGP-NEXT: s_endpgm 3205; 3206; GFX10-CU-LABEL: local_workgroup_one_as_unordered_load: 3207; GFX10-CU: ; %bb.0: ; %entry 3208; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3209; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3210; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3211; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3212; GFX10-CU-NEXT: ds_read_b32 v0, v0 3213; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3214; GFX10-CU-NEXT: ds_write_b32 v1, v0 3215; GFX10-CU-NEXT: s_endpgm 3216; 3217; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_load: 3218; SKIP-CACHE-INV: ; %bb.0: ; %entry 3219; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3220; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3221; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3222; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3223; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 3224; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3225; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3226; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 3227; SKIP-CACHE-INV-NEXT: s_endpgm 3228; 
3229; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: 3230; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3231; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3232; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3233; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3234; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 3235; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3236; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3237; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 3238; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3239; 3240; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: 3241; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3242; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3243; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3244; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3245; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 3246; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3247; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3248; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 3249; GFX90A-TGSPLIT-NEXT: s_endpgm 3250 i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 3251entry: 3252 %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4 3253 store i32 %val, i32 addrspace(3)* %out 3254 ret void 3255} 3256 3257define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( 3258; GFX6-LABEL: local_workgroup_one_as_monotonic_load: 3259; GFX6: ; %bb.0: ; %entry 3260; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3261; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3262; GFX6-NEXT: s_mov_b32 m0, -1 3263; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3264; GFX6-NEXT: v_mov_b32_e32 v0, s0 3265; GFX6-NEXT: ds_read_b32 v0, v0 3266; GFX6-NEXT: v_mov_b32_e32 v1, s1 3267; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3268; GFX6-NEXT: ds_write_b32 v1, v0 3269; GFX6-NEXT: s_endpgm 3270; 3271; GFX7-LABEL: local_workgroup_one_as_monotonic_load: 3272; GFX7: ; %bb.0: ; %entry 3273; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3274; GFX7-NEXT: s_mov_b32 m0, -1 3275; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) 3276; GFX7-NEXT: v_mov_b32_e32 v0, s0 3277; GFX7-NEXT: ds_read_b32 v0, v0 3278; GFX7-NEXT: v_mov_b32_e32 v1, s1 3279; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3280; GFX7-NEXT: ds_write_b32 v1, v0 3281; GFX7-NEXT: s_endpgm 3282; 3283; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_load: 3284; GFX10-WGP: ; %bb.0: ; %entry 3285; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3286; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3287; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3288; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3289; GFX10-WGP-NEXT: ds_read_b32 v0, v0 3290; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3291; GFX10-WGP-NEXT: ds_write_b32 v1, v0 3292; GFX10-WGP-NEXT: s_endpgm 3293; 3294; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_load: 3295; GFX10-CU: ; %bb.0: ; %entry 3296; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3297; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3298; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3299; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3300; GFX10-CU-NEXT: ds_read_b32 v0, v0 3301; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3302; GFX10-CU-NEXT: ds_write_b32 v1, v0 3303; GFX10-CU-NEXT: s_endpgm 3304; 3305; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_load: 3306; SKIP-CACHE-INV: ; %bb.0: ; %entry 3307; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3308; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3309; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3310; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3311; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 3312; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3313; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3314; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 3315; SKIP-CACHE-INV-NEXT: s_endpgm 3316; 3317; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: 3318; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3319; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3320; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3321; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3322; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 3323; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3324; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3325; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 3326; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3327; 3328; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: 3329; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3330; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3331; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3332; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3333; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 3334; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3335; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3336; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 3337; GFX90A-TGSPLIT-NEXT: s_endpgm 3338 i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 3339entry: 3340 %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4 3341 store i32 %val, i32 addrspace(3)* %out 3342 ret void 3343} 3344 3345define amdgpu_kernel void @local_workgroup_one_as_acquire_load( 3346; GFX6-LABEL: local_workgroup_one_as_acquire_load: 3347; GFX6: ; %bb.0: ; %entry 3348; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3349; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3350; GFX6-NEXT: s_mov_b32 m0, -1 3351; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3352; GFX6-NEXT: v_mov_b32_e32 v0, s0 3353; GFX6-NEXT: ds_read_b32 v0, v0 3354; GFX6-NEXT: v_mov_b32_e32 v1, s1 3355; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3356; GFX6-NEXT: ds_write_b32 v1, v0 3357; GFX6-NEXT: s_endpgm 3358; 3359; GFX7-LABEL: local_workgroup_one_as_acquire_load: 3360; GFX7: ; %bb.0: ; %entry 3361; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3362; GFX7-NEXT: s_mov_b32 m0, -1 3363; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3364; GFX7-NEXT: v_mov_b32_e32 v0, s0 3365; GFX7-NEXT: ds_read_b32 v0, v0 3366; GFX7-NEXT: v_mov_b32_e32 v1, s1 3367; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3368; GFX7-NEXT: ds_write_b32 v1, v0 3369; GFX7-NEXT: s_endpgm 3370; 3371; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_load: 3372; GFX10-WGP: ; %bb.0: ; %entry 3373; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 3374; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3375; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3376; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3377; GFX10-WGP-NEXT: ds_read_b32 v0, v0 3378; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3379; GFX10-WGP-NEXT: ds_write_b32 v1, v0 3380; GFX10-WGP-NEXT: s_endpgm 3381; 3382; GFX10-CU-LABEL: local_workgroup_one_as_acquire_load: 3383; GFX10-CU: ; %bb.0: ; %entry 3384; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3385; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3386; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3387; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3388; GFX10-CU-NEXT: ds_read_b32 v0, v0 3389; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3390; GFX10-CU-NEXT: ds_write_b32 v1, v0 3391; GFX10-CU-NEXT: s_endpgm 3392; 3393; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_load: 3394; SKIP-CACHE-INV: ; %bb.0: ; %entry 3395; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3396; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3397; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3398; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3399; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 3400; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3401; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3402; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 3403; SKIP-CACHE-INV-NEXT: s_endpgm 3404; 3405; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: 3406; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3407; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3408; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3409; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3410; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 3411; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3412; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3413; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 3414; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3415; 3416; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: 3417; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3418; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3419; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3420; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3421; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 3422; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3423; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3424; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 3425; GFX90A-TGSPLIT-NEXT: s_endpgm 3426 i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 3427entry: 3428 %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4 3429 store i32 %val, i32 addrspace(3)* %out 3430 ret void 3431} 3432 3433define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( 3434; GFX6-LABEL: local_workgroup_one_as_seq_cst_load: 3435; GFX6: ; %bb.0: ; %entry 3436; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3437; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3438; GFX6-NEXT: s_mov_b32 m0, -1 3439; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3440; GFX6-NEXT: v_mov_b32_e32 v0, s0 3441; GFX6-NEXT: ds_read_b32 v0, v0 3442; GFX6-NEXT: v_mov_b32_e32 v1, s1 3443; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3444; GFX6-NEXT: ds_write_b32 v1, v0 3445; GFX6-NEXT: s_endpgm 3446; 3447; GFX7-LABEL: local_workgroup_one_as_seq_cst_load: 3448; GFX7: ; %bb.0: ; %entry 3449; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3450; GFX7-NEXT: s_mov_b32 m0, -1 3451; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3452; GFX7-NEXT: v_mov_b32_e32 v0, s0 3453; GFX7-NEXT: ds_read_b32 v0, v0 3454; GFX7-NEXT: v_mov_b32_e32 v1, s1 3455; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3456; GFX7-NEXT: ds_write_b32 v1, v0 3457; GFX7-NEXT: s_endpgm 3458; 3459; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_load: 3460; GFX10-WGP: ; %bb.0: ; %entry 3461; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3462; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3463; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3464; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3465; GFX10-WGP-NEXT: ds_read_b32 v0, v0 3466; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3467; GFX10-WGP-NEXT: ds_write_b32 v1, v0 3468; GFX10-WGP-NEXT: s_endpgm 3469; 3470; GFX10-CU-LABEL: 
local_workgroup_one_as_seq_cst_load: 3471; GFX10-CU: ; %bb.0: ; %entry 3472; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3473; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3474; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3475; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3476; GFX10-CU-NEXT: ds_read_b32 v0, v0 3477; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3478; GFX10-CU-NEXT: ds_write_b32 v1, v0 3479; GFX10-CU-NEXT: s_endpgm 3480; 3481; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_load: 3482; SKIP-CACHE-INV: ; %bb.0: ; %entry 3483; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3484; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3485; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3486; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3487; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 3488; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3489; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3490; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 3491; SKIP-CACHE-INV-NEXT: s_endpgm 3492; 3493; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: 3494; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3495; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3496; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3497; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3498; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 3499; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3500; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3501; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 3502; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3503; 3504; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: 3505; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3506; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3507; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3508; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3509; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 3510; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3511; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3512; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 3513; GFX90A-TGSPLIT-NEXT: s_endpgm 3514 i32 addrspace(3)* %in, i32 
addrspace(3)* %out) { 3515entry: 3516 %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4 3517 store i32 %val, i32 addrspace(3)* %out 3518 ret void 3519} 3520 3521define amdgpu_kernel void @local_workgroup_one_as_unordered_store( 3522; GFX6-LABEL: local_workgroup_one_as_unordered_store: 3523; GFX6: ; %bb.0: ; %entry 3524; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3525; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3526; GFX6-NEXT: s_mov_b32 m0, -1 3527; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3528; GFX6-NEXT: v_mov_b32_e32 v1, s0 3529; GFX6-NEXT: v_mov_b32_e32 v0, s1 3530; GFX6-NEXT: ds_write_b32 v0, v1 3531; GFX6-NEXT: s_endpgm 3532; 3533; GFX7-LABEL: local_workgroup_one_as_unordered_store: 3534; GFX7: ; %bb.0: ; %entry 3535; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3536; GFX7-NEXT: s_mov_b32 m0, -1 3537; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3538; GFX7-NEXT: v_mov_b32_e32 v0, s1 3539; GFX7-NEXT: v_mov_b32_e32 v1, s0 3540; GFX7-NEXT: ds_write_b32 v0, v1 3541; GFX7-NEXT: s_endpgm 3542; 3543; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_store: 3544; GFX10-WGP: ; %bb.0: ; %entry 3545; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3546; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3547; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 3548; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 3549; GFX10-WGP-NEXT: ds_write_b32 v0, v1 3550; GFX10-WGP-NEXT: s_endpgm 3551; 3552; GFX10-CU-LABEL: local_workgroup_one_as_unordered_store: 3553; GFX10-CU: ; %bb.0: ; %entry 3554; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3555; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3556; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 3557; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 3558; GFX10-CU-NEXT: ds_write_b32 v0, v1 3559; GFX10-CU-NEXT: s_endpgm 3560; 3561; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_store: 3562; SKIP-CACHE-INV: ; %bb.0: ; %entry 3563; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3564; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3565; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) 3566; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 3567; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 3568; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 3569; SKIP-CACHE-INV-NEXT: s_endpgm 3570; 3571; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: 3572; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3573; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3574; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3575; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 3576; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 3577; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 3578; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3579; 3580; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: 3581; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3582; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3583; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3584; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 3585; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 3586; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 3587; GFX90A-TGSPLIT-NEXT: s_endpgm 3588 i32 %in, i32 addrspace(3)* %out) { 3589entry: 3590 store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4 3591 ret void 3592} 3593 3594define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( 3595; GFX6-LABEL: local_workgroup_one_as_monotonic_store: 3596; GFX6: ; %bb.0: ; %entry 3597; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3598; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3599; GFX6-NEXT: s_mov_b32 m0, -1 3600; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3601; GFX6-NEXT: v_mov_b32_e32 v1, s0 3602; GFX6-NEXT: v_mov_b32_e32 v0, s1 3603; GFX6-NEXT: ds_write_b32 v0, v1 3604; GFX6-NEXT: s_endpgm 3605; 3606; GFX7-LABEL: local_workgroup_one_as_monotonic_store: 3607; GFX7: ; %bb.0: ; %entry 3608; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3609; GFX7-NEXT: s_mov_b32 m0, -1 3610; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3611; GFX7-NEXT: v_mov_b32_e32 v0, s1 3612; GFX7-NEXT: v_mov_b32_e32 v1, s0 3613; GFX7-NEXT: ds_write_b32 v0, v1 3614; 
GFX7-NEXT: s_endpgm 3615; 3616; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_store: 3617; GFX10-WGP: ; %bb.0: ; %entry 3618; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3619; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3620; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 3621; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 3622; GFX10-WGP-NEXT: ds_write_b32 v0, v1 3623; GFX10-WGP-NEXT: s_endpgm 3624; 3625; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_store: 3626; GFX10-CU: ; %bb.0: ; %entry 3627; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3628; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3629; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 3630; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 3631; GFX10-CU-NEXT: ds_write_b32 v0, v1 3632; GFX10-CU-NEXT: s_endpgm 3633; 3634; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_store: 3635; SKIP-CACHE-INV: ; %bb.0: ; %entry 3636; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3637; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3638; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3639; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 3640; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 3641; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 3642; SKIP-CACHE-INV-NEXT: s_endpgm 3643; 3644; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: 3645; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3646; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3647; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3648; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 3649; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 3650; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 3651; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3652; 3653; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: 3654; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3655; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3656; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3657; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 3658; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 3659; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 3660; 
GFX90A-TGSPLIT-NEXT: s_endpgm 3661 i32 %in, i32 addrspace(3)* %out) { 3662entry: 3663 store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4 3664 ret void 3665} 3666 3667define amdgpu_kernel void @local_workgroup_one_as_release_store( 3668; GFX6-LABEL: local_workgroup_one_as_release_store: 3669; GFX6: ; %bb.0: ; %entry 3670; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3671; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3672; GFX6-NEXT: s_mov_b32 m0, -1 3673; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3674; GFX6-NEXT: v_mov_b32_e32 v1, s0 3675; GFX6-NEXT: v_mov_b32_e32 v0, s1 3676; GFX6-NEXT: ds_write_b32 v0, v1 3677; GFX6-NEXT: s_endpgm 3678; 3679; GFX7-LABEL: local_workgroup_one_as_release_store: 3680; GFX7: ; %bb.0: ; %entry 3681; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3682; GFX7-NEXT: s_mov_b32 m0, -1 3683; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3684; GFX7-NEXT: v_mov_b32_e32 v0, s1 3685; GFX7-NEXT: v_mov_b32_e32 v1, s0 3686; GFX7-NEXT: ds_write_b32 v0, v1 3687; GFX7-NEXT: s_endpgm 3688; 3689; GFX10-WGP-LABEL: local_workgroup_one_as_release_store: 3690; GFX10-WGP: ; %bb.0: ; %entry 3691; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3692; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3693; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 3694; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 3695; GFX10-WGP-NEXT: ds_write_b32 v0, v1 3696; GFX10-WGP-NEXT: s_endpgm 3697; 3698; GFX10-CU-LABEL: local_workgroup_one_as_release_store: 3699; GFX10-CU: ; %bb.0: ; %entry 3700; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3701; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3702; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 3703; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 3704; GFX10-CU-NEXT: ds_write_b32 v0, v1 3705; GFX10-CU-NEXT: s_endpgm 3706; 3707; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_store: 3708; SKIP-CACHE-INV: ; %bb.0: ; %entry 3709; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3710; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3711; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 
3712; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 3713; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 3714; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 3715; SKIP-CACHE-INV-NEXT: s_endpgm 3716; 3717; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: 3718; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3719; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3720; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3721; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 3722; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 3723; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 3724; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3725; 3726; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_store: 3727; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3728; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3729; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3730; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 3731; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 3732; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 3733; GFX90A-TGSPLIT-NEXT: s_endpgm 3734 i32 %in, i32 addrspace(3)* %out) { 3735entry: 3736 store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4 3737 ret void 3738} 3739 3740define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( 3741; GFX6-LABEL: local_workgroup_one_as_seq_cst_store: 3742; GFX6: ; %bb.0: ; %entry 3743; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3744; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3745; GFX6-NEXT: s_mov_b32 m0, -1 3746; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3747; GFX6-NEXT: v_mov_b32_e32 v1, s0 3748; GFX6-NEXT: v_mov_b32_e32 v0, s1 3749; GFX6-NEXT: ds_write_b32 v0, v1 3750; GFX6-NEXT: s_endpgm 3751; 3752; GFX7-LABEL: local_workgroup_one_as_seq_cst_store: 3753; GFX7: ; %bb.0: ; %entry 3754; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3755; GFX7-NEXT: s_mov_b32 m0, -1 3756; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3757; GFX7-NEXT: v_mov_b32_e32 v0, s1 3758; GFX7-NEXT: v_mov_b32_e32 v1, s0 3759; GFX7-NEXT: ds_write_b32 v0, v1 3760; GFX7-NEXT: s_endpgm 3761; 
3762; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_store: 3763; GFX10-WGP: ; %bb.0: ; %entry 3764; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3765; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3766; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 3767; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 3768; GFX10-WGP-NEXT: ds_write_b32 v0, v1 3769; GFX10-WGP-NEXT: s_endpgm 3770; 3771; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_store: 3772; GFX10-CU: ; %bb.0: ; %entry 3773; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3774; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3775; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 3776; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 3777; GFX10-CU-NEXT: ds_write_b32 v0, v1 3778; GFX10-CU-NEXT: s_endpgm 3779; 3780; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_store: 3781; SKIP-CACHE-INV: ; %bb.0: ; %entry 3782; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3783; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3784; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3785; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 3786; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 3787; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 3788; SKIP-CACHE-INV-NEXT: s_endpgm 3789; 3790; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: 3791; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3792; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3793; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3794; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 3795; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 3796; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 3797; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3798; 3799; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: 3800; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3801; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3802; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3803; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 3804; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 3805; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 3806; GFX90A-TGSPLIT-NEXT: s_endpgm 3807 i32 %in, i32 
addrspace(3)* %out) { 3808entry: 3809 store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4 3810 ret void 3811} 3812 3813define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( 3814; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw: 3815; GFX6: ; %bb.0: ; %entry 3816; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3817; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3818; GFX6-NEXT: s_mov_b32 m0, -1 3819; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3820; GFX6-NEXT: v_mov_b32_e32 v0, s0 3821; GFX6-NEXT: v_mov_b32_e32 v1, s1 3822; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3823; GFX6-NEXT: s_endpgm 3824; 3825; GFX7-LABEL: local_workgroup_one_as_monotonic_atomicrmw: 3826; GFX7: ; %bb.0: ; %entry 3827; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3828; GFX7-NEXT: s_mov_b32 m0, -1 3829; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3830; GFX7-NEXT: v_mov_b32_e32 v0, s0 3831; GFX7-NEXT: v_mov_b32_e32 v1, s1 3832; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3833; GFX7-NEXT: s_endpgm 3834; 3835; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: 3836; GFX10-WGP: ; %bb.0: ; %entry 3837; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3838; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3839; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3840; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3841; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3842; GFX10-WGP-NEXT: s_endpgm 3843; 3844; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: 3845; GFX10-CU: ; %bb.0: ; %entry 3846; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3847; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3848; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3849; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3850; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3851; GFX10-CU-NEXT: s_endpgm 3852; 3853; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_atomicrmw: 3854; SKIP-CACHE-INV: ; %bb.0: ; %entry 3855; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3856; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3857; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) 3858; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3859; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3860; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3861; SKIP-CACHE-INV-NEXT: s_endpgm 3862; 3863; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: 3864; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3865; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3866; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3867; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3868; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3869; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3870; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3871; 3872; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: 3873; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3874; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3875; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3876; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3877; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3878; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3879; GFX90A-TGSPLIT-NEXT: s_endpgm 3880 i32 addrspace(3)* %out, i32 %in) { 3881entry: 3882 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic 3883 ret void 3884} 3885 3886define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( 3887; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw: 3888; GFX6: ; %bb.0: ; %entry 3889; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3890; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3891; GFX6-NEXT: s_mov_b32 m0, -1 3892; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3893; GFX6-NEXT: v_mov_b32_e32 v0, s0 3894; GFX6-NEXT: v_mov_b32_e32 v1, s1 3895; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3896; GFX6-NEXT: s_endpgm 3897; 3898; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: 3899; GFX7: ; %bb.0: ; %entry 3900; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3901; GFX7-NEXT: s_mov_b32 m0, -1 3902; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3903; GFX7-NEXT: v_mov_b32_e32 v0, s0 3904; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 3905; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3906; GFX7-NEXT: s_endpgm 3907; 3908; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: 3909; GFX10-WGP: ; %bb.0: ; %entry 3910; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3911; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3912; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3913; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3914; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3915; GFX10-WGP-NEXT: s_endpgm 3916; 3917; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: 3918; GFX10-CU: ; %bb.0: ; %entry 3919; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3920; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3921; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3922; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3923; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3924; GFX10-CU-NEXT: s_endpgm 3925; 3926; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_atomicrmw: 3927; SKIP-CACHE-INV: ; %bb.0: ; %entry 3928; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3929; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 3930; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3931; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3932; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3933; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3934; SKIP-CACHE-INV-NEXT: s_endpgm 3935; 3936; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: 3937; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3938; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3939; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3940; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 3941; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3942; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3943; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3944; 3945; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: 3946; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3947; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3948; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3949; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 
s0 3950; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 3951; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3952; GFX90A-TGSPLIT-NEXT: s_endpgm 3953 i32 addrspace(3)* %out, i32 %in) { 3954entry: 3955 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire 3956 ret void 3957} 3958 3959define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( 3960; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw: 3961; GFX6: ; %bb.0: ; %entry 3962; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 3963; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 3964; GFX6-NEXT: s_mov_b32 m0, -1 3965; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3966; GFX6-NEXT: v_mov_b32_e32 v0, s0 3967; GFX6-NEXT: v_mov_b32_e32 v1, s1 3968; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3969; GFX6-NEXT: s_endpgm 3970; 3971; GFX7-LABEL: local_workgroup_one_as_release_atomicrmw: 3972; GFX7: ; %bb.0: ; %entry 3973; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3974; GFX7-NEXT: s_mov_b32 m0, -1 3975; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3976; GFX7-NEXT: v_mov_b32_e32 v0, s0 3977; GFX7-NEXT: v_mov_b32_e32 v1, s1 3978; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3979; GFX7-NEXT: s_endpgm 3980; 3981; GFX10-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: 3982; GFX10-WGP: ; %bb.0: ; %entry 3983; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3984; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3985; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3986; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3987; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3988; GFX10-WGP-NEXT: s_endpgm 3989; 3990; GFX10-CU-LABEL: local_workgroup_one_as_release_atomicrmw: 3991; GFX10-CU: ; %bb.0: ; %entry 3992; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3993; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3994; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3995; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3996; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 3997; GFX10-CU-NEXT: s_endpgm 3998; 3999; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_atomicrmw: 4000; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 4001; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4002; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4003; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4004; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4005; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4006; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4007; SKIP-CACHE-INV-NEXT: s_endpgm 4008; 4009; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: 4010; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4011; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4012; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4013; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4014; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4015; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4016; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4017; 4018; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: 4019; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4020; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4021; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4022; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4023; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4024; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4025; GFX90A-TGSPLIT-NEXT: s_endpgm 4026 i32 addrspace(3)* %out, i32 %in) { 4027entry: 4028 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release 4029 ret void 4030} 4031 4032define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( 4033; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: 4034; GFX6: ; %bb.0: ; %entry 4035; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 4036; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 4037; GFX6-NEXT: s_mov_b32 m0, -1 4038; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4039; GFX6-NEXT: v_mov_b32_e32 v0, s0 4040; GFX6-NEXT: v_mov_b32_e32 v1, s1 4041; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4042; GFX6-NEXT: s_endpgm 4043; 4044; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: 4045; GFX7: ; %bb.0: ; %entry 4046; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 4047; GFX7-NEXT: s_mov_b32 m0, -1 4048; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4049; GFX7-NEXT: v_mov_b32_e32 v0, s0 4050; GFX7-NEXT: v_mov_b32_e32 v1, s1 4051; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4052; GFX7-NEXT: s_endpgm 4053; 4054; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: 4055; GFX10-WGP: ; %bb.0: ; %entry 4056; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4057; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4058; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4059; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4060; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4061; GFX10-WGP-NEXT: s_endpgm 4062; 4063; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: 4064; GFX10-CU: ; %bb.0: ; %entry 4065; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4066; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4067; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4068; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4069; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4070; GFX10-CU-NEXT: s_endpgm 4071; 4072; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: 4073; SKIP-CACHE-INV: ; %bb.0: ; %entry 4074; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4075; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4076; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4077; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4078; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4079; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4080; SKIP-CACHE-INV-NEXT: s_endpgm 4081; 4082; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: 4083; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4084; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4085; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4086; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4087; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4088; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4089; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4090; 4091; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: 4092; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry 4093; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4094; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4095; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4096; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4097; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4098; GFX90A-TGSPLIT-NEXT: s_endpgm 4099 i32 addrspace(3)* %out, i32 %in) { 4100entry: 4101 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel 4102 ret void 4103} 4104 4105define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( 4106; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: 4107; GFX6: ; %bb.0: ; %entry 4108; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 4109; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 4110; GFX6-NEXT: s_mov_b32 m0, -1 4111; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4112; GFX6-NEXT: v_mov_b32_e32 v0, s0 4113; GFX6-NEXT: v_mov_b32_e32 v1, s1 4114; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4115; GFX6-NEXT: s_endpgm 4116; 4117; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: 4118; GFX7: ; %bb.0: ; %entry 4119; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4120; GFX7-NEXT: s_mov_b32 m0, -1 4121; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4122; GFX7-NEXT: v_mov_b32_e32 v0, s0 4123; GFX7-NEXT: v_mov_b32_e32 v1, s1 4124; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4125; GFX7-NEXT: s_endpgm 4126; 4127; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: 4128; GFX10-WGP: ; %bb.0: ; %entry 4129; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4130; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4131; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4132; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4133; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4134; GFX10-WGP-NEXT: s_endpgm 4135; 4136; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: 4137; GFX10-CU: ; %bb.0: ; %entry 4138; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4139; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4140; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4141; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4142; 
GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4143; GFX10-CU-NEXT: s_endpgm 4144; 4145; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: 4146; SKIP-CACHE-INV: ; %bb.0: ; %entry 4147; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4148; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4149; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4150; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4151; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4152; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4153; SKIP-CACHE-INV-NEXT: s_endpgm 4154; 4155; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: 4156; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4157; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4158; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4159; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4160; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4161; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4162; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4163; 4164; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: 4165; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4166; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4167; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4168; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4169; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4170; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 4171; GFX90A-TGSPLIT-NEXT: s_endpgm 4172 i32 addrspace(3)* %out, i32 %in) { 4173entry: 4174 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst 4175 ret void 4176} 4177 4178define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( 4179; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: 4180; GFX6: ; %bb.0: ; %entry 4181; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 4182; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 4183; GFX6-NEXT: s_mov_b32 m0, -1 4184; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4185; GFX6-NEXT: v_mov_b32_e32 v0, s0 4186; GFX6-NEXT: v_mov_b32_e32 v1, s1 4187; GFX6-NEXT: 
ds_wrxchg_rtn_b32 v1, v0, v1 4188; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4189; GFX6-NEXT: ds_write_b32 v0, v1 4190; GFX6-NEXT: s_endpgm 4191; 4192; GFX7-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: 4193; GFX7: ; %bb.0: ; %entry 4194; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4195; GFX7-NEXT: s_mov_b32 m0, -1 4196; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4197; GFX7-NEXT: v_mov_b32_e32 v0, s0 4198; GFX7-NEXT: v_mov_b32_e32 v1, s1 4199; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4200; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4201; GFX7-NEXT: ds_write_b32 v0, v1 4202; GFX7-NEXT: s_endpgm 4203; 4204; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: 4205; GFX10-WGP: ; %bb.0: ; %entry 4206; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4207; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4208; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4209; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4210; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4211; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4212; GFX10-WGP-NEXT: ds_write_b32 v0, v1 4213; GFX10-WGP-NEXT: s_endpgm 4214; 4215; GFX10-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: 4216; GFX10-CU: ; %bb.0: ; %entry 4217; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4218; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4219; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4220; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4221; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4222; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4223; GFX10-CU-NEXT: ds_write_b32 v0, v1 4224; GFX10-CU-NEXT: s_endpgm 4225; 4226; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: 4227; SKIP-CACHE-INV: ; %bb.0: ; %entry 4228; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4229; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4230; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4231; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4232; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4233; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4234; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4235; SKIP-CACHE-INV-NEXT: 
ds_write_b32 v0, v1 4236; SKIP-CACHE-INV-NEXT: s_endpgm 4237; 4238; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: 4239; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4240; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4241; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4242; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4243; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4244; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4245; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4246; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 4247; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4248; 4249; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: 4250; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4251; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4252; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4253; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4254; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4255; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4256; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4257; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 4258; GFX90A-TGSPLIT-NEXT: s_endpgm 4259 i32 addrspace(3)* %out, i32 %in) { 4260entry: 4261 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire 4262 store i32 %val, i32 addrspace(3)* %out, align 4 4263 ret void 4264} 4265 4266define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( 4267; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: 4268; GFX6: ; %bb.0: ; %entry 4269; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 4270; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 4271; GFX6-NEXT: s_mov_b32 m0, -1 4272; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4273; GFX6-NEXT: v_mov_b32_e32 v0, s0 4274; GFX6-NEXT: v_mov_b32_e32 v1, s1 4275; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4276; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4277; GFX6-NEXT: ds_write_b32 v0, v1 4278; GFX6-NEXT: s_endpgm 4279; 4280; GFX7-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: 4281; GFX7: ; %bb.0: ; 
%entry 4282; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4283; GFX7-NEXT: s_mov_b32 m0, -1 4284; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4285; GFX7-NEXT: v_mov_b32_e32 v0, s0 4286; GFX7-NEXT: v_mov_b32_e32 v1, s1 4287; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4288; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4289; GFX7-NEXT: ds_write_b32 v0, v1 4290; GFX7-NEXT: s_endpgm 4291; 4292; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: 4293; GFX10-WGP: ; %bb.0: ; %entry 4294; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4295; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4296; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4297; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4298; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4299; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4300; GFX10-WGP-NEXT: ds_write_b32 v0, v1 4301; GFX10-WGP-NEXT: s_endpgm 4302; 4303; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: 4304; GFX10-CU: ; %bb.0: ; %entry 4305; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4306; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4307; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4308; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4309; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4310; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4311; GFX10-CU-NEXT: ds_write_b32 v0, v1 4312; GFX10-CU-NEXT: s_endpgm 4313; 4314; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: 4315; SKIP-CACHE-INV: ; %bb.0: ; %entry 4316; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4317; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4318; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4319; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4320; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4321; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4322; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4323; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 4324; SKIP-CACHE-INV-NEXT: s_endpgm 4325; 4326; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: 4327; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4328; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 4329; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4330; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4331; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4332; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4333; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4334; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 4335; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4336; 4337; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: 4338; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4339; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4340; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4341; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4342; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4343; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4344; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4345; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 4346; GFX90A-TGSPLIT-NEXT: s_endpgm 4347 i32 addrspace(3)* %out, i32 %in) { 4348entry: 4349 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel 4350 store i32 %val, i32 addrspace(3)* %out, align 4 4351 ret void 4352} 4353 4354define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( 4355; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: 4356; GFX6: ; %bb.0: ; %entry 4357; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 4358; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 4359; GFX6-NEXT: s_mov_b32 m0, -1 4360; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4361; GFX6-NEXT: v_mov_b32_e32 v0, s0 4362; GFX6-NEXT: v_mov_b32_e32 v1, s1 4363; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4364; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4365; GFX6-NEXT: ds_write_b32 v0, v1 4366; GFX6-NEXT: s_endpgm 4367; 4368; GFX7-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: 4369; GFX7: ; %bb.0: ; %entry 4370; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4371; GFX7-NEXT: s_mov_b32 m0, -1 4372; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4373; GFX7-NEXT: v_mov_b32_e32 v0, s0 4374; GFX7-NEXT: v_mov_b32_e32 v1, s1 4375; GFX7-NEXT: 
ds_wrxchg_rtn_b32 v1, v0, v1 4376; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4377; GFX7-NEXT: ds_write_b32 v0, v1 4378; GFX7-NEXT: s_endpgm 4379; 4380; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: 4381; GFX10-WGP: ; %bb.0: ; %entry 4382; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4383; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4384; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4385; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4386; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4387; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4388; GFX10-WGP-NEXT: ds_write_b32 v0, v1 4389; GFX10-WGP-NEXT: s_endpgm 4390; 4391; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: 4392; GFX10-CU: ; %bb.0: ; %entry 4393; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4394; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4395; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4396; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4397; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4398; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4399; GFX10-CU-NEXT: ds_write_b32 v0, v1 4400; GFX10-CU-NEXT: s_endpgm 4401; 4402; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: 4403; SKIP-CACHE-INV: ; %bb.0: ; %entry 4404; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4405; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4406; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4407; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4408; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4409; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4410; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4411; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 4412; SKIP-CACHE-INV-NEXT: s_endpgm 4413; 4414; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: 4415; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4416; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4417; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4418; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4419; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4420; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, 
v0, v1 4421; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4422; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 4423; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4424; 4425; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: 4426; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4427; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4428; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4429; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4430; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 4431; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 4432; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4433; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 4434; GFX90A-TGSPLIT-NEXT: s_endpgm 4435 i32 addrspace(3)* %out, i32 %in) { 4436entry: 4437 %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst 4438 store i32 %val, i32 addrspace(3)* %out, align 4 4439 ret void 4440} 4441 4442define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( 4443; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: 4444; GFX6: ; %bb.0: ; %entry 4445; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 4446; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 4447; GFX6-NEXT: s_mov_b32 m0, -1 4448; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4449; GFX6-NEXT: v_mov_b32_e32 v0, s2 4450; GFX6-NEXT: v_mov_b32_e32 v1, s1 4451; GFX6-NEXT: v_mov_b32_e32 v2, s0 4452; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4453; GFX6-NEXT: s_endpgm 4454; 4455; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: 4456; GFX7: ; %bb.0: ; %entry 4457; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4458; GFX7-NEXT: s_mov_b32 m0, -1 4459; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4460; GFX7-NEXT: v_mov_b32_e32 v0, s0 4461; GFX7-NEXT: v_mov_b32_e32 v1, s2 4462; GFX7-NEXT: v_mov_b32_e32 v2, s1 4463; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4464; GFX7-NEXT: s_endpgm 4465; 4466; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: 4467; GFX10-WGP: ; %bb.0: ; %entry 4468; GFX10-WGP-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 4469; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4470; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4471; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 4472; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 4473; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4474; GFX10-WGP-NEXT: s_endpgm 4475; 4476; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: 4477; GFX10-CU: ; %bb.0: ; %entry 4478; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4479; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4480; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4481; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 4482; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 4483; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4484; GFX10-CU-NEXT: s_endpgm 4485; 4486; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: 4487; SKIP-CACHE-INV: ; %bb.0: ; %entry 4488; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4489; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4490; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4491; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4492; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4493; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 4494; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4495; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4496; SKIP-CACHE-INV-NEXT: s_endpgm 4497; 4498; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: 4499; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4500; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4501; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4502; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4503; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4504; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4505; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4506; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4507; 4508; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: 4509; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4510; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4511; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4512; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4513; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4514; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4515; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4516; GFX90A-TGSPLIT-NEXT: s_endpgm 4517 i32 addrspace(3)* %out, i32 %in, i32 %old) { 4518entry: 4519 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 4520 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic 4521 ret void 4522} 4523 4524define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( 4525; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: 4526; GFX6: ; %bb.0: ; %entry 4527; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 4528; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 4529; GFX6-NEXT: s_mov_b32 m0, -1 4530; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4531; GFX6-NEXT: v_mov_b32_e32 v0, s2 4532; GFX6-NEXT: v_mov_b32_e32 v1, s1 4533; GFX6-NEXT: v_mov_b32_e32 v2, s0 4534; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4535; GFX6-NEXT: s_endpgm 4536; 4537; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: 4538; GFX7: ; %bb.0: ; %entry 4539; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4540; GFX7-NEXT: s_mov_b32 m0, -1 4541; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4542; GFX7-NEXT: v_mov_b32_e32 v0, s0 4543; GFX7-NEXT: v_mov_b32_e32 v1, s2 4544; GFX7-NEXT: v_mov_b32_e32 v2, s1 4545; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4546; GFX7-NEXT: s_endpgm 4547; 4548; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: 4549; GFX10-WGP: ; %bb.0: ; %entry 4550; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4551; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4552; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4553; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 4554; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 4555; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4556; GFX10-WGP-NEXT: s_endpgm 4557; 4558; GFX10-CU-LABEL: 
local_workgroup_one_as_acquire_monotonic_cmpxchg: 4559; GFX10-CU: ; %bb.0: ; %entry 4560; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4561; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4562; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4563; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 4564; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 4565; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4566; GFX10-CU-NEXT: s_endpgm 4567; 4568; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: 4569; SKIP-CACHE-INV: ; %bb.0: ; %entry 4570; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4571; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4572; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4573; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4574; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4575; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 4576; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4577; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4578; SKIP-CACHE-INV-NEXT: s_endpgm 4579; 4580; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: 4581; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4582; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4583; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4584; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4585; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4586; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4587; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4588; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4589; 4590; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: 4591; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4592; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4593; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4594; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4595; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4596; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4597; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4598; GFX90A-TGSPLIT-NEXT: s_endpgm 4599 i32 addrspace(3)* %out, i32 %in, i32 %old) { 
4600entry: 4601 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 4602 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic 4603 ret void 4604} 4605 4606define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( 4607; GFX6-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: 4608; GFX6: ; %bb.0: ; %entry 4609; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 4610; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 4611; GFX6-NEXT: s_mov_b32 m0, -1 4612; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4613; GFX6-NEXT: v_mov_b32_e32 v0, s2 4614; GFX6-NEXT: v_mov_b32_e32 v1, s1 4615; GFX6-NEXT: v_mov_b32_e32 v2, s0 4616; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4617; GFX6-NEXT: s_endpgm 4618; 4619; GFX7-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: 4620; GFX7: ; %bb.0: ; %entry 4621; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4622; GFX7-NEXT: s_mov_b32 m0, -1 4623; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4624; GFX7-NEXT: v_mov_b32_e32 v0, s0 4625; GFX7-NEXT: v_mov_b32_e32 v1, s2 4626; GFX7-NEXT: v_mov_b32_e32 v2, s1 4627; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4628; GFX7-NEXT: s_endpgm 4629; 4630; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: 4631; GFX10-WGP: ; %bb.0: ; %entry 4632; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4633; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4634; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4635; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 4636; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 4637; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4638; GFX10-WGP-NEXT: s_endpgm 4639; 4640; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: 4641; GFX10-CU: ; %bb.0: ; %entry 4642; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4643; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4644; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4645; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 4646; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 4647; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 
4648; GFX10-CU-NEXT: s_endpgm 4649; 4650; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: 4651; SKIP-CACHE-INV: ; %bb.0: ; %entry 4652; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4653; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4654; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4655; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4656; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4657; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 4658; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4659; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4660; SKIP-CACHE-INV-NEXT: s_endpgm 4661; 4662; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: 4663; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4664; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4665; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4666; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4667; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4668; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4669; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4670; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4671; 4672; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: 4673; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4674; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4675; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4676; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4677; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4678; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4679; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4680; GFX90A-TGSPLIT-NEXT: s_endpgm 4681 i32 addrspace(3)* %out, i32 %in, i32 %old) { 4682entry: 4683 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 4684 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic 4685 ret void 4686} 4687 4688define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( 4689; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: 
4690; GFX6: ; %bb.0: ; %entry 4691; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 4692; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 4693; GFX6-NEXT: s_mov_b32 m0, -1 4694; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4695; GFX6-NEXT: v_mov_b32_e32 v0, s2 4696; GFX6-NEXT: v_mov_b32_e32 v1, s1 4697; GFX6-NEXT: v_mov_b32_e32 v2, s0 4698; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4699; GFX6-NEXT: s_endpgm 4700; 4701; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: 4702; GFX7: ; %bb.0: ; %entry 4703; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4704; GFX7-NEXT: s_mov_b32 m0, -1 4705; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4706; GFX7-NEXT: v_mov_b32_e32 v0, s0 4707; GFX7-NEXT: v_mov_b32_e32 v1, s2 4708; GFX7-NEXT: v_mov_b32_e32 v2, s1 4709; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4710; GFX7-NEXT: s_endpgm 4711; 4712; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: 4713; GFX10-WGP: ; %bb.0: ; %entry 4714; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4715; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4716; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4717; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 4718; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 4719; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4720; GFX10-WGP-NEXT: s_endpgm 4721; 4722; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: 4723; GFX10-CU: ; %bb.0: ; %entry 4724; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4725; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4726; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4727; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 4728; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 4729; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4730; GFX10-CU-NEXT: s_endpgm 4731; 4732; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: 4733; SKIP-CACHE-INV: ; %bb.0: ; %entry 4734; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4735; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4736; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4737; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 
4738; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4739; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 4740; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4741; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4742; SKIP-CACHE-INV-NEXT: s_endpgm 4743; 4744; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: 4745; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4746; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4747; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4748; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4749; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4750; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4751; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4752; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4753; 4754; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: 4755; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4756; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4757; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4758; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4759; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4760; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4761; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4762; GFX90A-TGSPLIT-NEXT: s_endpgm 4763 i32 addrspace(3)* %out, i32 %in, i32 %old) { 4764entry: 4765 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 4766 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic 4767 ret void 4768} 4769 4770define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( 4771; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4772; GFX6: ; %bb.0: ; %entry 4773; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 4774; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 4775; GFX6-NEXT: s_mov_b32 m0, -1 4776; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4777; GFX6-NEXT: v_mov_b32_e32 v0, s2 4778; GFX6-NEXT: v_mov_b32_e32 v1, s1 4779; GFX6-NEXT: v_mov_b32_e32 v2, s0 4780; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 
4781; GFX6-NEXT: s_endpgm 4782; 4783; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4784; GFX7: ; %bb.0: ; %entry 4785; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4786; GFX7-NEXT: s_mov_b32 m0, -1 4787; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4788; GFX7-NEXT: v_mov_b32_e32 v0, s0 4789; GFX7-NEXT: v_mov_b32_e32 v1, s2 4790; GFX7-NEXT: v_mov_b32_e32 v2, s1 4791; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4792; GFX7-NEXT: s_endpgm 4793; 4794; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4795; GFX10-WGP: ; %bb.0: ; %entry 4796; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4797; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4798; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4799; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 4800; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 4801; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4802; GFX10-WGP-NEXT: s_endpgm 4803; 4804; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4805; GFX10-CU: ; %bb.0: ; %entry 4806; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4807; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4808; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4809; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 4810; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 4811; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4812; GFX10-CU-NEXT: s_endpgm 4813; 4814; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4815; SKIP-CACHE-INV: ; %bb.0: ; %entry 4816; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4817; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4818; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4819; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4820; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4821; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 4822; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4823; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4824; SKIP-CACHE-INV-NEXT: s_endpgm 4825; 4826; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4827; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry 4828; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4829; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4830; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4831; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4832; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4833; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4834; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4835; 4836; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4837; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4838; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4839; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4840; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4841; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4842; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4843; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4844; GFX90A-TGSPLIT-NEXT: s_endpgm 4845 i32 addrspace(3)* %out, i32 %in, i32 %old) { 4846entry: 4847 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 4848 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic 4849 ret void 4850} 4851 4852define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( 4853; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: 4854; GFX6: ; %bb.0: ; %entry 4855; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 4856; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 4857; GFX6-NEXT: s_mov_b32 m0, -1 4858; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4859; GFX6-NEXT: v_mov_b32_e32 v0, s2 4860; GFX6-NEXT: v_mov_b32_e32 v1, s1 4861; GFX6-NEXT: v_mov_b32_e32 v2, s0 4862; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4863; GFX6-NEXT: s_endpgm 4864; 4865; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: 4866; GFX7: ; %bb.0: ; %entry 4867; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4868; GFX7-NEXT: s_mov_b32 m0, -1 4869; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4870; GFX7-NEXT: v_mov_b32_e32 v0, s0 4871; GFX7-NEXT: v_mov_b32_e32 v1, s2 4872; GFX7-NEXT: v_mov_b32_e32 v2, s1 
4873; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4874; GFX7-NEXT: s_endpgm 4875; 4876; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: 4877; GFX10-WGP: ; %bb.0: ; %entry 4878; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4879; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4880; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4881; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 4882; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 4883; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4884; GFX10-WGP-NEXT: s_endpgm 4885; 4886; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: 4887; GFX10-CU: ; %bb.0: ; %entry 4888; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4889; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4890; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4891; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 4892; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 4893; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4894; GFX10-CU-NEXT: s_endpgm 4895; 4896; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: 4897; SKIP-CACHE-INV: ; %bb.0: ; %entry 4898; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4899; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4900; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4901; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4902; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4903; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 4904; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4905; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4906; SKIP-CACHE-INV-NEXT: s_endpgm 4907; 4908; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: 4909; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4910; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4911; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4912; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4913; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4914; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4915; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4916; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 
4917; 4918; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: 4919; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4920; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4921; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4922; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4923; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4924; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4925; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4926; GFX90A-TGSPLIT-NEXT: s_endpgm 4927 i32 addrspace(3)* %out, i32 %in, i32 %old) { 4928entry: 4929 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 4930 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire 4931 ret void 4932} 4933 4934define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( 4935; GFX6-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: 4936; GFX6: ; %bb.0: ; %entry 4937; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 4938; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 4939; GFX6-NEXT: s_mov_b32 m0, -1 4940; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4941; GFX6-NEXT: v_mov_b32_e32 v0, s2 4942; GFX6-NEXT: v_mov_b32_e32 v1, s1 4943; GFX6-NEXT: v_mov_b32_e32 v2, s0 4944; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4945; GFX6-NEXT: s_endpgm 4946; 4947; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: 4948; GFX7: ; %bb.0: ; %entry 4949; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4950; GFX7-NEXT: s_mov_b32 m0, -1 4951; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4952; GFX7-NEXT: v_mov_b32_e32 v0, s0 4953; GFX7-NEXT: v_mov_b32_e32 v1, s2 4954; GFX7-NEXT: v_mov_b32_e32 v2, s1 4955; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4956; GFX7-NEXT: s_endpgm 4957; 4958; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: 4959; GFX10-WGP: ; %bb.0: ; %entry 4960; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4961; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4962; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4963; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 4964; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 4965; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4966; GFX10-WGP-NEXT: s_endpgm 4967; 4968; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: 4969; GFX10-CU: ; %bb.0: ; %entry 4970; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4971; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4972; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4973; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 4974; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 4975; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4976; GFX10-CU-NEXT: s_endpgm 4977; 4978; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: 4979; SKIP-CACHE-INV: ; %bb.0: ; %entry 4980; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4981; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 4982; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 4983; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4984; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4985; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 4986; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4987; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4988; SKIP-CACHE-INV-NEXT: s_endpgm 4989; 4990; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: 4991; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4992; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4993; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4994; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 4995; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 4996; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 4997; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 4998; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4999; 5000; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: 5001; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5002; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5003; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5004; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5005; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5006; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5007; 
GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5008; GFX90A-TGSPLIT-NEXT: s_endpgm 5009 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5010entry: 5011 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5012 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire 5013 ret void 5014} 5015 5016define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( 5017; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: 5018; GFX6: ; %bb.0: ; %entry 5019; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5020; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5021; GFX6-NEXT: s_mov_b32 m0, -1 5022; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5023; GFX6-NEXT: v_mov_b32_e32 v0, s2 5024; GFX6-NEXT: v_mov_b32_e32 v1, s1 5025; GFX6-NEXT: v_mov_b32_e32 v2, s0 5026; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5027; GFX6-NEXT: s_endpgm 5028; 5029; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: 5030; GFX7: ; %bb.0: ; %entry 5031; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5032; GFX7-NEXT: s_mov_b32 m0, -1 5033; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5034; GFX7-NEXT: v_mov_b32_e32 v0, s0 5035; GFX7-NEXT: v_mov_b32_e32 v1, s2 5036; GFX7-NEXT: v_mov_b32_e32 v2, s1 5037; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5038; GFX7-NEXT: s_endpgm 5039; 5040; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: 5041; GFX10-WGP: ; %bb.0: ; %entry 5042; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5043; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5044; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5045; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5046; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5047; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5048; GFX10-WGP-NEXT: s_endpgm 5049; 5050; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: 5051; GFX10-CU: ; %bb.0: ; %entry 5052; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5053; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5054; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5055; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5056; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5057; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5058; GFX10-CU-NEXT: s_endpgm 5059; 5060; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: 5061; SKIP-CACHE-INV: ; %bb.0: ; %entry 5062; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5063; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5064; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5065; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5066; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5067; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5068; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5069; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5070; SKIP-CACHE-INV-NEXT: s_endpgm 5071; 5072; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: 5073; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5074; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5075; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5076; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5077; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5078; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5079; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5080; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5081; 5082; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: 5083; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5084; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5085; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5086; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5087; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5088; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5089; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5090; GFX90A-TGSPLIT-NEXT: s_endpgm 5091 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5092entry: 5093 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5094 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire 5095 ret void 5096} 5097 5098define amdgpu_kernel 
void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( 5099; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: 5100; GFX6: ; %bb.0: ; %entry 5101; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5102; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5103; GFX6-NEXT: s_mov_b32 m0, -1 5104; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5105; GFX6-NEXT: v_mov_b32_e32 v0, s2 5106; GFX6-NEXT: v_mov_b32_e32 v1, s1 5107; GFX6-NEXT: v_mov_b32_e32 v2, s0 5108; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5109; GFX6-NEXT: s_endpgm 5110; 5111; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: 5112; GFX7: ; %bb.0: ; %entry 5113; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5114; GFX7-NEXT: s_mov_b32 m0, -1 5115; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5116; GFX7-NEXT: v_mov_b32_e32 v0, s0 5117; GFX7-NEXT: v_mov_b32_e32 v1, s2 5118; GFX7-NEXT: v_mov_b32_e32 v2, s1 5119; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5120; GFX7-NEXT: s_endpgm 5121; 5122; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: 5123; GFX10-WGP: ; %bb.0: ; %entry 5124; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5125; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5126; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5127; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5128; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5129; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5130; GFX10-WGP-NEXT: s_endpgm 5131; 5132; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: 5133; GFX10-CU: ; %bb.0: ; %entry 5134; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5135; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5136; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5137; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5138; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5139; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5140; GFX10-CU-NEXT: s_endpgm 5141; 5142; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: 5143; SKIP-CACHE-INV: ; %bb.0: ; %entry 5144; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5145; SKIP-CACHE-INV-NEXT: s_load_dword 
s0, s[0:1], 0xb 5146; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5147; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5148; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5149; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5150; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5151; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5152; SKIP-CACHE-INV-NEXT: s_endpgm 5153; 5154; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: 5155; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5156; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5157; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5158; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5159; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5160; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5161; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5162; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5163; 5164; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: 5165; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5166; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5167; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5168; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5169; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5170; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5171; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5172; GFX90A-TGSPLIT-NEXT: s_endpgm 5173 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5174entry: 5175 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5176 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire 5177 ret void 5178} 5179 5180define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( 5181; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 5182; GFX6: ; %bb.0: ; %entry 5183; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5184; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5185; GFX6-NEXT: s_mov_b32 m0, -1 5186; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5187; GFX6-NEXT: v_mov_b32_e32 v0, s2 5188; GFX6-NEXT: v_mov_b32_e32 
v1, s1 5189; GFX6-NEXT: v_mov_b32_e32 v2, s0 5190; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5191; GFX6-NEXT: s_endpgm 5192; 5193; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 5194; GFX7: ; %bb.0: ; %entry 5195; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5196; GFX7-NEXT: s_mov_b32 m0, -1 5197; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5198; GFX7-NEXT: v_mov_b32_e32 v0, s0 5199; GFX7-NEXT: v_mov_b32_e32 v1, s2 5200; GFX7-NEXT: v_mov_b32_e32 v2, s1 5201; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5202; GFX7-NEXT: s_endpgm 5203; 5204; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 5205; GFX10-WGP: ; %bb.0: ; %entry 5206; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5207; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5208; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5209; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5210; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5211; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5212; GFX10-WGP-NEXT: s_endpgm 5213; 5214; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 5215; GFX10-CU: ; %bb.0: ; %entry 5216; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5217; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5218; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5219; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5220; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5221; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5222; GFX10-CU-NEXT: s_endpgm 5223; 5224; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 5225; SKIP-CACHE-INV: ; %bb.0: ; %entry 5226; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5227; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5228; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5229; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5230; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5231; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5232; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5233; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5234; SKIP-CACHE-INV-NEXT: s_endpgm 5235; 5236; GFX90A-NOTTGSPLIT-LABEL: 
local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 5237; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5238; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5239; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5240; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5241; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5242; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5243; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5244; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5245; 5246; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 5247; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5248; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5249; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5250; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5251; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5252; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5253; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 5254; GFX90A-TGSPLIT-NEXT: s_endpgm 5255 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5256entry: 5257 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5258 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst 5259 ret void 5260} 5261 5262define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( 5263; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 5264; GFX6: ; %bb.0: ; %entry 5265; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5266; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5267; GFX6-NEXT: s_mov_b32 m0, -1 5268; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5269; GFX6-NEXT: v_mov_b32_e32 v0, s2 5270; GFX6-NEXT: v_mov_b32_e32 v1, s1 5271; GFX6-NEXT: v_mov_b32_e32 v2, s0 5272; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5273; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5274; GFX6-NEXT: ds_write_b32 v0, v1 5275; GFX6-NEXT: s_endpgm 5276; 5277; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 5278; GFX7: ; %bb.0: ; %entry 5279; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5280; 
GFX7-NEXT: s_mov_b32 m0, -1 5281; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5282; GFX7-NEXT: v_mov_b32_e32 v0, s0 5283; GFX7-NEXT: v_mov_b32_e32 v1, s2 5284; GFX7-NEXT: v_mov_b32_e32 v2, s1 5285; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5286; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5287; GFX7-NEXT: ds_write_b32 v0, v1 5288; GFX7-NEXT: s_endpgm 5289; 5290; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 5291; GFX10-WGP: ; %bb.0: ; %entry 5292; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5293; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5294; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5295; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5296; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5297; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5298; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5299; GFX10-WGP-NEXT: ds_write_b32 v0, v1 5300; GFX10-WGP-NEXT: s_endpgm 5301; 5302; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 5303; GFX10-CU: ; %bb.0: ; %entry 5304; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5305; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5306; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5307; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5308; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5309; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5310; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5311; GFX10-CU-NEXT: ds_write_b32 v0, v1 5312; GFX10-CU-NEXT: s_endpgm 5313; 5314; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 5315; SKIP-CACHE-INV: ; %bb.0: ; %entry 5316; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5317; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5318; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5319; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5320; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5321; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5322; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5323; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5324; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5325; 
SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 5326; SKIP-CACHE-INV-NEXT: s_endpgm 5327; 5328; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 5329; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5330; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5331; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5332; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5333; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5334; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5335; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5336; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5337; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 5338; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5339; 5340; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 5341; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5342; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5343; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5344; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5345; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5346; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5347; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5348; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5349; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 5350; GFX90A-TGSPLIT-NEXT: s_endpgm 5351 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5352entry: 5353 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5354 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic 5355 %val0 = extractvalue { i32, i1 } %val, 0 5356 store i32 %val0, i32 addrspace(3)* %out, align 4 5357 ret void 5358} 5359 5360define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( 5361; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 5362; GFX6: ; %bb.0: ; %entry 5363; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5364; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5365; GFX6-NEXT: s_mov_b32 m0, -1 5366; GFX6-NEXT: s_waitcnt lgkmcnt(0) 
5367; GFX6-NEXT: v_mov_b32_e32 v0, s2 5368; GFX6-NEXT: v_mov_b32_e32 v1, s1 5369; GFX6-NEXT: v_mov_b32_e32 v2, s0 5370; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5371; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5372; GFX6-NEXT: ds_write_b32 v0, v1 5373; GFX6-NEXT: s_endpgm 5374; 5375; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 5376; GFX7: ; %bb.0: ; %entry 5377; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5378; GFX7-NEXT: s_mov_b32 m0, -1 5379; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5380; GFX7-NEXT: v_mov_b32_e32 v0, s0 5381; GFX7-NEXT: v_mov_b32_e32 v1, s2 5382; GFX7-NEXT: v_mov_b32_e32 v2, s1 5383; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5384; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5385; GFX7-NEXT: ds_write_b32 v0, v1 5386; GFX7-NEXT: s_endpgm 5387; 5388; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 5389; GFX10-WGP: ; %bb.0: ; %entry 5390; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5391; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5392; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5393; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5394; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5395; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5396; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5397; GFX10-WGP-NEXT: ds_write_b32 v0, v1 5398; GFX10-WGP-NEXT: s_endpgm 5399; 5400; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 5401; GFX10-CU: ; %bb.0: ; %entry 5402; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5403; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5404; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5405; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5406; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5407; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5408; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5409; GFX10-CU-NEXT: ds_write_b32 v0, v1 5410; GFX10-CU-NEXT: s_endpgm 5411; 5412; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 5413; SKIP-CACHE-INV: ; %bb.0: ; %entry 5414; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x9 5415; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5416; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5417; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5418; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5419; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5420; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5421; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5422; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5423; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 5424; SKIP-CACHE-INV-NEXT: s_endpgm 5425; 5426; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 5427; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5428; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5429; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5430; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5431; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5432; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5433; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5434; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5435; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 5436; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5437; 5438; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 5439; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5440; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5441; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5442; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5443; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5444; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5445; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5446; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5447; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 5448; GFX90A-TGSPLIT-NEXT: s_endpgm 5449 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5450entry: 5451 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5452 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic 5453 %val0 = extractvalue { i32, i1 } %val, 0 
5454 store i32 %val0, i32 addrspace(3)* %out, align 4 5455 ret void 5456} 5457 5458define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( 5459; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 5460; GFX6: ; %bb.0: ; %entry 5461; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5462; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5463; GFX6-NEXT: s_mov_b32 m0, -1 5464; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5465; GFX6-NEXT: v_mov_b32_e32 v0, s2 5466; GFX6-NEXT: v_mov_b32_e32 v1, s1 5467; GFX6-NEXT: v_mov_b32_e32 v2, s0 5468; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5469; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5470; GFX6-NEXT: ds_write_b32 v0, v1 5471; GFX6-NEXT: s_endpgm 5472; 5473; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 5474; GFX7: ; %bb.0: ; %entry 5475; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5476; GFX7-NEXT: s_mov_b32 m0, -1 5477; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5478; GFX7-NEXT: v_mov_b32_e32 v0, s0 5479; GFX7-NEXT: v_mov_b32_e32 v1, s2 5480; GFX7-NEXT: v_mov_b32_e32 v2, s1 5481; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5482; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5483; GFX7-NEXT: ds_write_b32 v0, v1 5484; GFX7-NEXT: s_endpgm 5485; 5486; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 5487; GFX10-WGP: ; %bb.0: ; %entry 5488; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5489; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5490; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5491; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5492; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5493; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5494; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5495; GFX10-WGP-NEXT: ds_write_b32 v0, v1 5496; GFX10-WGP-NEXT: s_endpgm 5497; 5498; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 5499; GFX10-CU: ; %bb.0: ; %entry 5500; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5501; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5502; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 
5503; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5504; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5505; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5506; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5507; GFX10-CU-NEXT: ds_write_b32 v0, v1 5508; GFX10-CU-NEXT: s_endpgm 5509; 5510; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 5511; SKIP-CACHE-INV: ; %bb.0: ; %entry 5512; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5513; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5514; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5515; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5516; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5517; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5518; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5519; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5520; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5521; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 5522; SKIP-CACHE-INV-NEXT: s_endpgm 5523; 5524; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 5525; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5526; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5527; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5528; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5529; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5530; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5531; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5532; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5533; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 5534; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5535; 5536; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 5537; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5538; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5539; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5540; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5541; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5542; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5543; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 5544; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5545; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 5546; GFX90A-TGSPLIT-NEXT: s_endpgm 5547 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5548entry: 5549 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5550 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic 5551 %val0 = extractvalue { i32, i1 } %val, 0 5552 store i32 %val0, i32 addrspace(3)* %out, align 4 5553 ret void 5554} 5555 5556define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( 5557; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: 5558; GFX6: ; %bb.0: ; %entry 5559; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5560; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5561; GFX6-NEXT: s_mov_b32 m0, -1 5562; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5563; GFX6-NEXT: v_mov_b32_e32 v0, s2 5564; GFX6-NEXT: v_mov_b32_e32 v1, s1 5565; GFX6-NEXT: v_mov_b32_e32 v2, s0 5566; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5567; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5568; GFX6-NEXT: ds_write_b32 v0, v1 5569; GFX6-NEXT: s_endpgm 5570; 5571; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: 5572; GFX7: ; %bb.0: ; %entry 5573; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5574; GFX7-NEXT: s_mov_b32 m0, -1 5575; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5576; GFX7-NEXT: v_mov_b32_e32 v0, s0 5577; GFX7-NEXT: v_mov_b32_e32 v1, s2 5578; GFX7-NEXT: v_mov_b32_e32 v2, s1 5579; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5580; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5581; GFX7-NEXT: ds_write_b32 v0, v1 5582; GFX7-NEXT: s_endpgm 5583; 5584; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: 5585; GFX10-WGP: ; %bb.0: ; %entry 5586; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5587; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5588; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5589; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5590; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5591; 
GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5592; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5593; GFX10-WGP-NEXT: ds_write_b32 v0, v1 5594; GFX10-WGP-NEXT: s_endpgm 5595; 5596; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: 5597; GFX10-CU: ; %bb.0: ; %entry 5598; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5599; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5600; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5601; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5602; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5603; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5604; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5605; GFX10-CU-NEXT: ds_write_b32 v0, v1 5606; GFX10-CU-NEXT: s_endpgm 5607; 5608; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: 5609; SKIP-CACHE-INV: ; %bb.0: ; %entry 5610; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5611; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5612; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5613; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5614; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5615; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5616; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5617; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5618; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5619; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 5620; SKIP-CACHE-INV-NEXT: s_endpgm 5621; 5622; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: 5623; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5624; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5625; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5626; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5627; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5628; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5629; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5630; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5631; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 5632; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5633; 5634; 
GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: 5635; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5636; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5637; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5638; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5639; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5640; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5641; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5642; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5643; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 5644; GFX90A-TGSPLIT-NEXT: s_endpgm 5645 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5646entry: 5647 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5648 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire 5649 %val0 = extractvalue { i32, i1 } %val, 0 5650 store i32 %val0, i32 addrspace(3)* %out, align 4 5651 ret void 5652} 5653 5654define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( 5655; GFX6-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: 5656; GFX6: ; %bb.0: ; %entry 5657; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5658; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5659; GFX6-NEXT: s_mov_b32 m0, -1 5660; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5661; GFX6-NEXT: v_mov_b32_e32 v0, s2 5662; GFX6-NEXT: v_mov_b32_e32 v1, s1 5663; GFX6-NEXT: v_mov_b32_e32 v2, s0 5664; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5665; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5666; GFX6-NEXT: ds_write_b32 v0, v1 5667; GFX6-NEXT: s_endpgm 5668; 5669; GFX7-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: 5670; GFX7: ; %bb.0: ; %entry 5671; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5672; GFX7-NEXT: s_mov_b32 m0, -1 5673; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5674; GFX7-NEXT: v_mov_b32_e32 v0, s0 5675; GFX7-NEXT: v_mov_b32_e32 v1, s2 5676; GFX7-NEXT: v_mov_b32_e32 v2, s1 5677; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5678; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
5679; GFX7-NEXT: ds_write_b32 v0, v1 5680; GFX7-NEXT: s_endpgm 5681; 5682; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: 5683; GFX10-WGP: ; %bb.0: ; %entry 5684; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5685; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5686; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5687; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5688; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5689; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5690; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5691; GFX10-WGP-NEXT: ds_write_b32 v0, v1 5692; GFX10-WGP-NEXT: s_endpgm 5693; 5694; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: 5695; GFX10-CU: ; %bb.0: ; %entry 5696; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5697; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5698; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5699; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5700; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5701; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5702; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5703; GFX10-CU-NEXT: ds_write_b32 v0, v1 5704; GFX10-CU-NEXT: s_endpgm 5705; 5706; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: 5707; SKIP-CACHE-INV: ; %bb.0: ; %entry 5708; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5709; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5710; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5711; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5712; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5713; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5714; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5715; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5716; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5717; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 5718; SKIP-CACHE-INV-NEXT: s_endpgm 5719; 5720; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: 5721; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5722; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5723; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5724; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5725; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5726; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5727; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5728; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5729; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 5730; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5731; 5732; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: 5733; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5734; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5735; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5736; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5737; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5738; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5739; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5740; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5741; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 5742; GFX90A-TGSPLIT-NEXT: s_endpgm 5743 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5744entry: 5745 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5746 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire 5747 %val0 = extractvalue { i32, i1 } %val, 0 5748 store i32 %val0, i32 addrspace(3)* %out, align 4 5749 ret void 5750} 5751 5752define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( 5753; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 5754; GFX6: ; %bb.0: ; %entry 5755; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5756; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5757; GFX6-NEXT: s_mov_b32 m0, -1 5758; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5759; GFX6-NEXT: v_mov_b32_e32 v0, s2 5760; GFX6-NEXT: v_mov_b32_e32 v1, s1 5761; GFX6-NEXT: v_mov_b32_e32 v2, s0 5762; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5763; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5764; GFX6-NEXT: ds_write_b32 v0, v1 5765; GFX6-NEXT: s_endpgm 5766; 5767; 
GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 5768; GFX7: ; %bb.0: ; %entry 5769; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5770; GFX7-NEXT: s_mov_b32 m0, -1 5771; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5772; GFX7-NEXT: v_mov_b32_e32 v0, s0 5773; GFX7-NEXT: v_mov_b32_e32 v1, s2 5774; GFX7-NEXT: v_mov_b32_e32 v2, s1 5775; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5776; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5777; GFX7-NEXT: ds_write_b32 v0, v1 5778; GFX7-NEXT: s_endpgm 5779; 5780; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 5781; GFX10-WGP: ; %bb.0: ; %entry 5782; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5783; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5784; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5785; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5786; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5787; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5788; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5789; GFX10-WGP-NEXT: ds_write_b32 v0, v1 5790; GFX10-WGP-NEXT: s_endpgm 5791; 5792; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 5793; GFX10-CU: ; %bb.0: ; %entry 5794; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5795; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5796; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5797; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5798; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5799; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5800; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5801; GFX10-CU-NEXT: ds_write_b32 v0, v1 5802; GFX10-CU-NEXT: s_endpgm 5803; 5804; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 5805; SKIP-CACHE-INV: ; %bb.0: ; %entry 5806; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5807; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5808; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5809; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5810; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5811; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5812; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s3 5813; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5814; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5815; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 5816; SKIP-CACHE-INV-NEXT: s_endpgm 5817; 5818; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 5819; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5820; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5821; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5822; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5823; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5824; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5825; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5826; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5827; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 5828; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5829; 5830; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 5831; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5832; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5833; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5834; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5835; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5836; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5837; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5838; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5839; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 5840; GFX90A-TGSPLIT-NEXT: s_endpgm 5841 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5842entry: 5843 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5844 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire 5845 %val0 = extractvalue { i32, i1 } %val, 0 5846 store i32 %val0, i32 addrspace(3)* %out, align 4 5847 ret void 5848} 5849 5850define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( 5851; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: 5852; GFX6: ; %bb.0: ; %entry 5853; GFX6-NEXT: s_load_dword s2, 
s[4:5], 0x0 5854; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5855; GFX6-NEXT: s_mov_b32 m0, -1 5856; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5857; GFX6-NEXT: v_mov_b32_e32 v0, s2 5858; GFX6-NEXT: v_mov_b32_e32 v1, s1 5859; GFX6-NEXT: v_mov_b32_e32 v2, s0 5860; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5861; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5862; GFX6-NEXT: ds_write_b32 v0, v1 5863; GFX6-NEXT: s_endpgm 5864; 5865; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: 5866; GFX7: ; %bb.0: ; %entry 5867; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5868; GFX7-NEXT: s_mov_b32 m0, -1 5869; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5870; GFX7-NEXT: v_mov_b32_e32 v0, s0 5871; GFX7-NEXT: v_mov_b32_e32 v1, s2 5872; GFX7-NEXT: v_mov_b32_e32 v2, s1 5873; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5874; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5875; GFX7-NEXT: ds_write_b32 v0, v1 5876; GFX7-NEXT: s_endpgm 5877; 5878; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: 5879; GFX10-WGP: ; %bb.0: ; %entry 5880; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5881; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5882; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5883; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5884; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5885; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5886; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5887; GFX10-WGP-NEXT: ds_write_b32 v0, v1 5888; GFX10-WGP-NEXT: s_endpgm 5889; 5890; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: 5891; GFX10-CU: ; %bb.0: ; %entry 5892; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5893; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5894; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5895; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5896; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5897; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5898; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5899; GFX10-CU-NEXT: ds_write_b32 v0, v1 5900; GFX10-CU-NEXT: s_endpgm 5901; 5902; SKIP-CACHE-INV-LABEL: 
local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: 5903; SKIP-CACHE-INV: ; %bb.0: ; %entry 5904; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5905; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 5906; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 5907; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5908; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5909; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 5910; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5911; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5912; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5913; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 5914; SKIP-CACHE-INV-NEXT: s_endpgm 5915; 5916; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: 5917; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5918; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5919; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5920; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5921; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5922; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5923; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5924; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5925; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 5926; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5927; 5928; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: 5929; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5930; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5931; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5932; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 5933; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 5934; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 5935; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5936; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5937; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 5938; GFX90A-TGSPLIT-NEXT: s_endpgm 5939 i32 addrspace(3)* %out, i32 %in, i32 %old) { 5940entry: 5941 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 5942 %val = cmpxchg volatile i32 
addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire 5943 %val0 = extractvalue { i32, i1 } %val, 0 5944 store i32 %val0, i32 addrspace(3)* %out, align 4 5945 ret void 5946} 5947 5948define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( 5949; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 5950; GFX6: ; %bb.0: ; %entry 5951; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 5952; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 5953; GFX6-NEXT: s_mov_b32 m0, -1 5954; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5955; GFX6-NEXT: v_mov_b32_e32 v0, s2 5956; GFX6-NEXT: v_mov_b32_e32 v1, s1 5957; GFX6-NEXT: v_mov_b32_e32 v2, s0 5958; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5959; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5960; GFX6-NEXT: ds_write_b32 v0, v1 5961; GFX6-NEXT: s_endpgm 5962; 5963; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 5964; GFX7: ; %bb.0: ; %entry 5965; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5966; GFX7-NEXT: s_mov_b32 m0, -1 5967; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5968; GFX7-NEXT: v_mov_b32_e32 v0, s0 5969; GFX7-NEXT: v_mov_b32_e32 v1, s2 5970; GFX7-NEXT: v_mov_b32_e32 v2, s1 5971; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5972; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5973; GFX7-NEXT: ds_write_b32 v0, v1 5974; GFX7-NEXT: s_endpgm 5975; 5976; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 5977; GFX10-WGP: ; %bb.0: ; %entry 5978; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 5979; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5980; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5981; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 5982; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 5983; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5984; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5985; GFX10-WGP-NEXT: ds_write_b32 v0, v1 5986; GFX10-WGP-NEXT: s_endpgm 5987; 5988; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 5989; GFX10-CU: ; %bb.0: ; %entry 5990; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 5991; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5992; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5993; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 5994; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 5995; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 5996; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5997; GFX10-CU-NEXT: ds_write_b32 v0, v1 5998; GFX10-CU-NEXT: s_endpgm 5999; 6000; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 6001; SKIP-CACHE-INV: ; %bb.0: ; %entry 6002; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 6003; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 6004; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 6005; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6006; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6007; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 6008; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 6009; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 6010; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6011; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 6012; SKIP-CACHE-INV-NEXT: s_endpgm 6013; 6014; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 6015; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6016; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6017; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6018; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6019; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 6020; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 6021; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 6022; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6023; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 6024; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6025; 6026; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 6027; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6028; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 6029; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6030; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 6031; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v1, s2 6032; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 6033; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 6034; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6035; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 6036; GFX90A-TGSPLIT-NEXT: s_endpgm 6037 i32 addrspace(3)* %out, i32 %in, i32 %old) { 6038entry: 6039 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 6040 %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst 6041 %val0 = extractvalue { i32, i1 } %val, 0 6042 store i32 %val0, i32 addrspace(3)* %out, align 4 6043 ret void 6044} 6045 6046