; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx90a -o - %s | FileCheck %s

; Packed struct: float at byte offset 0, double at byte offset 4, 12 bytes per
; element. That size is the multiplier 12 in the v_mad_u64_u32 address
; computations expected below.
%S = type <{ float, double }>

; The result of an atomic operation must not be treated as a uniform value.
;
; Every kernel below performs an atomic on %p, zero-extends the returned value,
; uses it to index the array of %S at %q, and stores 1.0 to the float field of
; the selected element. The expected code keeps the returned value in a VGPR
; and forms the store address with vector (v_*) instructions.

; Value returned by 'atomicrmw add' feeds the store address.
define protected amdgpu_kernel void @add(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: add:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_add v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw add i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Value returned by 'atomicrmw sub' feeds the store address.
define protected amdgpu_kernel void @sub(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: sub:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_sub v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw sub i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Value returned by 'atomicrmw and' feeds the store address.
define protected amdgpu_kernel void @and(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: and:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_and v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw and i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Value returned by 'atomicrmw or' feeds the store address.
define protected amdgpu_kernel void @or(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: or:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_or v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw or i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Value returned by 'atomicrmw xor' feeds the store address.
define protected amdgpu_kernel void @xor(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: xor:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_atomic_xor v2, v0, v1, s[0:1] glc
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw xor i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Value returned by 'atomicrmw nand' feeds the store address. The expected
; output implements the nand as a global_atomic_cmpswap retry loop
; (%atomicrmw.start .. %atomicrmw.end) rather than a single atomic instruction.
define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
; CHECK-LABEL: nand:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    s_mov_b64 s[4:5], 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v0, s6
; CHECK-NEXT:  BB5_1: ; %atomicrmw.start
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    v_not_b32_e32 v0, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    v_or_b32_e32 v0, -2, v0
; CHECK-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT:    s_cbranch_execnz BB5_1
; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = atomicrmw nand i32 addrspace(1)* %p, i32 1 monotonic
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}
151 152define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) { 153; CHECK-LABEL: max: 154; CHECK: ; %bb.0: 155; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 156; CHECK-NEXT: v_mov_b32_e32 v0, 0 157; CHECK-NEXT: v_mov_b32_e32 v1, 1 158; CHECK-NEXT: s_waitcnt lgkmcnt(0) 159; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc 160; CHECK-NEXT: v_mov_b32_e32 v0, s2 161; CHECK-NEXT: v_mov_b32_e32 v1, s3 162; CHECK-NEXT: s_waitcnt vmcnt(0) 163; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] 164; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 165; CHECK-NEXT: global_store_dword v[0:1], v2, off 166; CHECK-NEXT: s_endpgm 167 %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 monotonic 168 %n64 = zext i32 %n32 to i64 169 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 170 store float 1.0, float addrspace(1)* %p1 171 ret void 172} 173 174define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) { 175; CHECK-LABEL: min: 176; CHECK: ; %bb.0: 177; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 178; CHECK-NEXT: v_mov_b32_e32 v0, 0 179; CHECK-NEXT: v_mov_b32_e32 v1, 1 180; CHECK-NEXT: s_waitcnt lgkmcnt(0) 181; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc 182; CHECK-NEXT: v_mov_b32_e32 v0, s2 183; CHECK-NEXT: v_mov_b32_e32 v1, s3 184; CHECK-NEXT: s_waitcnt vmcnt(0) 185; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] 186; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 187; CHECK-NEXT: global_store_dword v[0:1], v2, off 188; CHECK-NEXT: s_endpgm 189 %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 monotonic 190 %n64 = zext i32 %n32 to i64 191 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 192 store float 1.0, float addrspace(1)* %p1 193 ret void 194} 195 196define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) { 197; CHECK-LABEL: umax: 198; CHECK: ; %bb.0: 199; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 200; CHECK-NEXT: v_mov_b32_e32 v0, 0 
201; CHECK-NEXT: v_mov_b32_e32 v1, 1 202; CHECK-NEXT: s_waitcnt lgkmcnt(0) 203; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc 204; CHECK-NEXT: v_mov_b32_e32 v0, s2 205; CHECK-NEXT: v_mov_b32_e32 v1, s3 206; CHECK-NEXT: s_waitcnt vmcnt(0) 207; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] 208; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 209; CHECK-NEXT: global_store_dword v[0:1], v2, off 210; CHECK-NEXT: s_endpgm 211 %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 monotonic 212 %n64 = zext i32 %n32 to i64 213 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 214 store float 1.0, float addrspace(1)* %p1 215 ret void 216} 217 218define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) { 219; CHECK-LABEL: umin: 220; CHECK: ; %bb.0: 221; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 222; CHECK-NEXT: v_mov_b32_e32 v0, 0 223; CHECK-NEXT: v_mov_b32_e32 v1, 1 224; CHECK-NEXT: s_waitcnt lgkmcnt(0) 225; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc 226; CHECK-NEXT: v_mov_b32_e32 v0, s2 227; CHECK-NEXT: v_mov_b32_e32 v1, s3 228; CHECK-NEXT: s_waitcnt vmcnt(0) 229; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] 230; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 231; CHECK-NEXT: global_store_dword v[0:1], v2, off 232; CHECK-NEXT: s_endpgm 233 %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 monotonic 234 %n64 = zext i32 %n32 to i64 235 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 236 store float 1.0, float addrspace(1)* %p1 237 ret void 238} 239 240define protected amdgpu_kernel void @cmpxchg(i32 addrspace(1)* %p, %S addrspace(1)* %q) { 241; CHECK-LABEL: cmpxchg: 242; CHECK: ; %bb.0: 243; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 244; CHECK-NEXT: v_mov_b32_e32 v2, 0 245; CHECK-NEXT: v_mov_b32_e32 v0, 2 246; CHECK-NEXT: v_mov_b32_e32 v1, 1 247; CHECK-NEXT: s_waitcnt lgkmcnt(0) 248; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc 249; CHECK-NEXT: v_mov_b32_e32 v0, s2 
250; CHECK-NEXT: v_mov_b32_e32 v1, s3 251; CHECK-NEXT: s_waitcnt vmcnt(0) 252; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] 253; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 254; CHECK-NEXT: global_store_dword v[0:1], v2, off 255; CHECK-NEXT: s_endpgm 256 %agg = cmpxchg i32 addrspace(1)* %p, i32 1, i32 2 monotonic monotonic 257 %n32 = extractvalue {i32, i1} %agg, 0 258 %n64 = zext i32 %n32 to i64 259 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 260 store float 1.0, float addrspace(1)* %p1 261 ret void 262} 263 264define protected amdgpu_kernel void @xchg(i32 addrspace(1)* %p, %S addrspace(1)* %q) { 265; CHECK-LABEL: xchg: 266; CHECK: ; %bb.0: 267; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 268; CHECK-NEXT: v_mov_b32_e32 v0, 0 269; CHECK-NEXT: v_mov_b32_e32 v1, 1 270; CHECK-NEXT: s_waitcnt lgkmcnt(0) 271; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc 272; CHECK-NEXT: v_mov_b32_e32 v0, s2 273; CHECK-NEXT: v_mov_b32_e32 v1, s3 274; CHECK-NEXT: s_waitcnt vmcnt(0) 275; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] 276; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 277; CHECK-NEXT: global_store_dword v[0:1], v2, off 278; CHECK-NEXT: s_endpgm 279 %n32 = atomicrmw xchg i32 addrspace(1)* %p, i32 1 monotonic 280 %n64 = zext i32 %n32 to i64 281 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 282 store float 1.0, float addrspace(1)* %p1 283 ret void 284} 285 286define protected amdgpu_kernel void @inc(i32 addrspace(1)* %p, %S addrspace(1)* %q) { 287; CHECK-LABEL: inc: 288; CHECK: ; %bb.0: 289; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 290; CHECK-NEXT: v_mov_b32_e32 v0, 0 291; CHECK-NEXT: s_waitcnt lgkmcnt(0) 292; CHECK-NEXT: global_atomic_inc v2, v0, v0, s[0:1] glc 293; CHECK-NEXT: v_mov_b32_e32 v0, s2 294; CHECK-NEXT: v_mov_b32_e32 v1, s3 295; CHECK-NEXT: s_waitcnt vmcnt(0) 296; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] 297; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 298; CHECK-NEXT: global_store_dword 
v[0:1], v2, off 299; CHECK-NEXT: s_endpgm 300 %n32 = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %p, i32 0, i32 0, i32 0, i1 false) 301 %n64 = zext i32 %n32 to i64 302 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 303 store float 1.0, float addrspace(1)* %p1 304 ret void 305} 306 307define protected amdgpu_kernel void @dec(i32 addrspace(1)* %p, %S addrspace(1)* %q) { 308; CHECK-LABEL: dec: 309; CHECK: ; %bb.0: 310; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 311; CHECK-NEXT: v_mov_b32_e32 v0, 0 312; CHECK-NEXT: s_waitcnt lgkmcnt(0) 313; CHECK-NEXT: global_atomic_dec v2, v0, v0, s[0:1] glc 314; CHECK-NEXT: v_mov_b32_e32 v0, s2 315; CHECK-NEXT: v_mov_b32_e32 v1, s3 316; CHECK-NEXT: s_waitcnt vmcnt(0) 317; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] 318; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 319; CHECK-NEXT: global_store_dword v[0:1], v2, off 320; CHECK-NEXT: s_endpgm 321 %n32 = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %p, i32 0, i32 0, i32 0, i1 false) 322 %n64 = zext i32 %n32 to i64 323 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 324 store float 1.0, float addrspace(1)* %p1 325 ret void 326} 327 328define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1)* %q) { 329; CHECK-LABEL: fadd: 330; CHECK: ; %bb.0: 331; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 332; CHECK-NEXT: s_mov_b64 s[4:5], 0 333; CHECK-NEXT: s_waitcnt lgkmcnt(0) 334; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 335; CHECK-NEXT: s_waitcnt lgkmcnt(0) 336; CHECK-NEXT: v_mov_b32_e32 v0, s6 337; CHECK-NEXT: BB14_1: ; %atomicrmw.start 338; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 339; CHECK-NEXT: v_mov_b32_e32 v1, v0 340; CHECK-NEXT: v_mov_b32_e32 v2, 0 341; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1 342; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc 343; CHECK-NEXT: s_waitcnt vmcnt(0) 344; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 345; CHECK-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] 346; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] 347; CHECK-NEXT: s_cbranch_execnz BB14_1 348; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end 349; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] 350; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 351; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 352; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3] 353; CHECK-NEXT: global_store_dword v[0:1], v2, off 354; CHECK-NEXT: s_endpgm 355 %f32 = atomicrmw fadd float addrspace(1)* %p, float 1.0 monotonic 356 %n32 = fptoui float %f32 to i32 357 %n64 = zext i32 %n32 to i64 358 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 359 store float 1.0, float addrspace(1)* %p1 360 ret void 361} 362 363define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1)* %q) { 364; CHECK-LABEL: fsub: 365; CHECK: ; %bb.0: 366; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 367; CHECK-NEXT: s_mov_b64 s[4:5], 0 368; CHECK-NEXT: s_waitcnt lgkmcnt(0) 369; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 370; CHECK-NEXT: s_waitcnt lgkmcnt(0) 371; CHECK-NEXT: v_mov_b32_e32 v0, s6 372; CHECK-NEXT: BB15_1: ; %atomicrmw.start 373; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 374; CHECK-NEXT: v_mov_b32_e32 v1, v0 375; CHECK-NEXT: v_mov_b32_e32 v2, 0 376; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1 377; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc 378; CHECK-NEXT: s_waitcnt vmcnt(0) 379; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 380; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 381; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] 382; CHECK-NEXT: s_cbranch_execnz BB15_1 383; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end 384; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] 385; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 386; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 387; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3] 388; CHECK-NEXT: global_store_dword v[0:1], v2, off 389; CHECK-NEXT: s_endpgm 390 %f32 = atomicrmw fsub float addrspace(1)* %p, float 1.0 monotonic 391 %n32 = fptoui float %f32 to i32 392 %n64 
= zext i32 %n32 to i64 393 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 394 store float 1.0, float addrspace(1)* %p1 395 ret void 396} 397 398define protected amdgpu_kernel void @fmin(double addrspace(1)* %p, %S addrspace(1)* %q) { 399; CHECK-LABEL: fmin: 400; CHECK: ; %bb.0: 401; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 402; CHECK-NEXT: v_mov_b32_e32 v0, 0 403; CHECK-NEXT: v_mov_b32_e32 v2, 0 404; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 405; CHECK-NEXT: s_waitcnt lgkmcnt(0) 406; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc 407; CHECK-NEXT: v_mov_b32_e32 v2, s2 408; CHECK-NEXT: v_mov_b32_e32 v3, s3 409; CHECK-NEXT: s_waitcnt vmcnt(0) 410; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] 411; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] 412; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 413; CHECK-NEXT: global_store_dword v[0:1], v2, off 414; CHECK-NEXT: s_endpgm 415 %f64 = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %p, double 1.0) 416 %n32 = fptoui double %f64 to i32 417 %n64 = zext i32 %n32 to i64 418 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 419 store float 1.0, float addrspace(1)* %p1 420 ret void 421} 422 423define protected amdgpu_kernel void @fmax(double addrspace(1)* %p, %S addrspace(1)* %q) { 424; CHECK-LABEL: fmax: 425; CHECK: ; %bb.0: 426; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 427; CHECK-NEXT: v_mov_b32_e32 v0, 0 428; CHECK-NEXT: v_mov_b32_e32 v2, 0 429; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 430; CHECK-NEXT: s_waitcnt lgkmcnt(0) 431; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc 432; CHECK-NEXT: v_mov_b32_e32 v2, s2 433; CHECK-NEXT: v_mov_b32_e32 v3, s3 434; CHECK-NEXT: s_waitcnt vmcnt(0) 435; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] 436; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] 437; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 438; CHECK-NEXT: global_store_dword v[0:1], v2, off 439; CHECK-NEXT: s_endpgm 440 
%f64 = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %p, double 1.0) 441 %n32 = fptoui double %f64 to i32 442 %n64 = zext i32 %n32 to i64 443 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 444 store float 1.0, float addrspace(1)* %p1 445 ret void 446} 447 448define protected amdgpu_kernel void @buffer.atomic.swap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 449; CHECK-LABEL: buffer.atomic.swap: 450; CHECK: ; %bb.0: 451; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 452; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 453; CHECK-NEXT: v_mov_b32_e32 v0, 1 454; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 455; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 456; CHECK-NEXT: s_waitcnt lgkmcnt(0) 457; CHECK-NEXT: v_mov_b32_e32 v1, s2 458; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc 459; CHECK-NEXT: s_waitcnt vmcnt(0) 460; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 461; CHECK-NEXT: global_store_dword v[0:1], v2, off 462; CHECK-NEXT: s_endpgm 463 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 464 %n64 = zext i32 %n32 to i64 465 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 466 store float 1.0, float addrspace(1)* %p1 467 ret void 468} 469 470define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 471; CHECK-LABEL: buffer.atomic.add: 472; CHECK: ; %bb.0: 473; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 474; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 475; CHECK-NEXT: v_mov_b32_e32 v0, 1 476; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 477; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 478; CHECK-NEXT: s_waitcnt lgkmcnt(0) 479; CHECK-NEXT: v_mov_b32_e32 v1, s2 480; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc 481; CHECK-NEXT: s_waitcnt vmcnt(0) 482; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 483; CHECK-NEXT: global_store_dword 
v[0:1], v2, off 484; CHECK-NEXT: s_endpgm 485 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 486 %n64 = zext i32 %n32 to i64 487 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 488 store float 1.0, float addrspace(1)* %p1 489 ret void 490} 491 492define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 493; CHECK-LABEL: buffer.atomic.sub: 494; CHECK: ; %bb.0: 495; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 496; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 497; CHECK-NEXT: v_mov_b32_e32 v0, 1 498; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 499; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 500; CHECK-NEXT: s_waitcnt lgkmcnt(0) 501; CHECK-NEXT: v_mov_b32_e32 v1, s2 502; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc 503; CHECK-NEXT: s_waitcnt vmcnt(0) 504; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 505; CHECK-NEXT: global_store_dword v[0:1], v2, off 506; CHECK-NEXT: s_endpgm 507 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 508 %n64 = zext i32 %n32 to i64 509 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 510 store float 1.0, float addrspace(1)* %p1 511 ret void 512} 513 514define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 515; CHECK-LABEL: buffer.atomic.smin: 516; CHECK: ; %bb.0: 517; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 518; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 519; CHECK-NEXT: v_mov_b32_e32 v0, 1 520; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 521; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 522; CHECK-NEXT: s_waitcnt lgkmcnt(0) 523; CHECK-NEXT: v_mov_b32_e32 v1, s2 524; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc 525; CHECK-NEXT: s_waitcnt vmcnt(0) 526; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 527; CHECK-NEXT: 
global_store_dword v[0:1], v2, off 528; CHECK-NEXT: s_endpgm 529 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 530 %n64 = zext i32 %n32 to i64 531 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 532 store float 1.0, float addrspace(1)* %p1 533 ret void 534} 535 536define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 537; CHECK-LABEL: buffer.atomic.smax: 538; CHECK: ; %bb.0: 539; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 540; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 541; CHECK-NEXT: v_mov_b32_e32 v0, 1 542; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 543; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 544; CHECK-NEXT: s_waitcnt lgkmcnt(0) 545; CHECK-NEXT: v_mov_b32_e32 v1, s2 546; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc 547; CHECK-NEXT: s_waitcnt vmcnt(0) 548; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 549; CHECK-NEXT: global_store_dword v[0:1], v2, off 550; CHECK-NEXT: s_endpgm 551 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 552 %n64 = zext i32 %n32 to i64 553 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 554 store float 1.0, float addrspace(1)* %p1 555 ret void 556} 557 558define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 559; CHECK-LABEL: buffer.atomic.umin: 560; CHECK: ; %bb.0: 561; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 562; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 563; CHECK-NEXT: v_mov_b32_e32 v0, 1 564; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 565; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 566; CHECK-NEXT: s_waitcnt lgkmcnt(0) 567; CHECK-NEXT: v_mov_b32_e32 v1, s2 568; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc 569; CHECK-NEXT: s_waitcnt vmcnt(0) 570; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 571; 
CHECK-NEXT: global_store_dword v[0:1], v2, off 572; CHECK-NEXT: s_endpgm 573 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 574 %n64 = zext i32 %n32 to i64 575 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 576 store float 1.0, float addrspace(1)* %p1 577 ret void 578} 579 580define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 581; CHECK-LABEL: buffer.atomic.umax: 582; CHECK: ; %bb.0: 583; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 584; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 585; CHECK-NEXT: v_mov_b32_e32 v0, 1 586; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 587; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 588; CHECK-NEXT: s_waitcnt lgkmcnt(0) 589; CHECK-NEXT: v_mov_b32_e32 v1, s2 590; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc 591; CHECK-NEXT: s_waitcnt vmcnt(0) 592; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 593; CHECK-NEXT: global_store_dword v[0:1], v2, off 594; CHECK-NEXT: s_endpgm 595 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 596 %n64 = zext i32 %n32 to i64 597 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 598 store float 1.0, float addrspace(1)* %p1 599 ret void 600} 601 602define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 603; CHECK-LABEL: buffer.atomic.and: 604; CHECK: ; %bb.0: 605; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 606; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 607; CHECK-NEXT: v_mov_b32_e32 v0, 1 608; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 609; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 610; CHECK-NEXT: s_waitcnt lgkmcnt(0) 611; CHECK-NEXT: v_mov_b32_e32 v1, s2 612; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc 613; CHECK-NEXT: s_waitcnt vmcnt(0) 614; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, 
s[0:1] 615; CHECK-NEXT: global_store_dword v[0:1], v2, off 616; CHECK-NEXT: s_endpgm 617 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 618 %n64 = zext i32 %n32 to i64 619 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 620 store float 1.0, float addrspace(1)* %p1 621 ret void 622} 623 624define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 625; CHECK-LABEL: buffer.atomic.or: 626; CHECK: ; %bb.0: 627; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 628; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 629; CHECK-NEXT: v_mov_b32_e32 v0, 1 630; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 631; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 632; CHECK-NEXT: s_waitcnt lgkmcnt(0) 633; CHECK-NEXT: v_mov_b32_e32 v1, s2 634; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc 635; CHECK-NEXT: s_waitcnt vmcnt(0) 636; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 637; CHECK-NEXT: global_store_dword v[0:1], v2, off 638; CHECK-NEXT: s_endpgm 639 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 640 %n64 = zext i32 %n32 to i64 641 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 642 store float 1.0, float addrspace(1)* %p1 643 ret void 644} 645 646define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 647; CHECK-LABEL: buffer.atomic.xor: 648; CHECK: ; %bb.0: 649; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 650; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 651; CHECK-NEXT: v_mov_b32_e32 v0, 1 652; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 653; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 654; CHECK-NEXT: s_waitcnt lgkmcnt(0) 655; CHECK-NEXT: v_mov_b32_e32 v1, s2 656; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc 657; CHECK-NEXT: s_waitcnt vmcnt(0) 658; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, 
s[0:1] 659; CHECK-NEXT: global_store_dword v[0:1], v2, off 660; CHECK-NEXT: s_endpgm 661 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 662 %n64 = zext i32 %n32 to i64 663 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 664 store float 1.0, float addrspace(1)* %p1 665 ret void 666} 667 668define protected amdgpu_kernel void @buffer.atomic.inc(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 669; CHECK-LABEL: buffer.atomic.inc: 670; CHECK: ; %bb.0: 671; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 672; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 673; CHECK-NEXT: v_mov_b32_e32 v0, 1 674; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 675; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 676; CHECK-NEXT: s_waitcnt lgkmcnt(0) 677; CHECK-NEXT: v_mov_b32_e32 v1, s2 678; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc 679; CHECK-NEXT: s_waitcnt vmcnt(0) 680; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] 681; CHECK-NEXT: global_store_dword v[0:1], v2, off 682; CHECK-NEXT: s_endpgm 683 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) 684 %n64 = zext i32 %n32 to i64 685 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0 686 store float 1.0, float addrspace(1)* %p1 687 ret void 688} 689 690define protected amdgpu_kernel void @buffer.atomic.dec(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) { 691; CHECK-LABEL: buffer.atomic.dec: 692; CHECK: ; %bb.0: 693; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 694; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 695; CHECK-NEXT: v_mov_b32_e32 v0, 1 696; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 697; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 698; CHECK-NEXT: s_waitcnt lgkmcnt(0) 699; CHECK-NEXT: v_mov_b32_e32 v1, s2 700; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc 701; CHECK-NEXT: s_waitcnt vmcnt(0) 702; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 
12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Raw buffer compare-and-swap. The i32 returned by the atomic is zero-extended
; and used as the index into the array of packed %S elements (12 bytes each),
; and a float is stored to field 0 of that element. The expected assembly forms
; the store address in VGPRs with v_mad_u64_u32 directly from the atomic result.
define protected amdgpu_kernel void @buffer.atomic.cmpswap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.cmpswap:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v1, 2
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 1, i32 2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Raw buffer f32 atomic add. The returned float is converted to an unsigned
; i32 (v_cvt_u32_f32 in the expected assembly) before it is used as the index
; into the %S array; the store address is again formed in VGPRs.
define protected amdgpu_kernel void @buffer.atomic.fadd(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.fadd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 1.0
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s2
; CHECK-NEXT:    buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f32 = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n32 = fptoui float %f32 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Raw buffer f64 atomic min. The returned double is converted to an unsigned
; i32 (v_cvt_u32_f64 in the expected assembly) and then used as the index into
; the %S array, with the store address formed in VGPRs.
define protected amdgpu_kernel void @buffer.atomic.fmin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.fmin:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n32 = fptoui double %f64 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Raw buffer f64 atomic max. Same shape as the fmin case: the returned double
; is converted to an unsigned i32 and used as the index into the %S array.
define protected amdgpu_kernel void @buffer.atomic.fmax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
; CHECK-LABEL: buffer.atomic.fmax:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  %n32 = fptoui double %f64 to i32
  %n64 = zext i32 %n32 to i64
  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
  store float 1.0, float addrspace(1)* %p1
  ret void
}

; Declarations of the AMDGPU atomic intrinsics called by the kernels in this file.
declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)*, i32, i32 immarg, i32 immarg, i1 immarg)
declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)*, i32, i32 immarg, i32 immarg, i1 immarg)
declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)*, double)
declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)*, double)
declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32)
declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32)
declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32)
declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32)