1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s 6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the atomic optimization pass will do for local pointers. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 31; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 32; GFX7LESS-NEXT: s_mov_b32 m0, -1 33; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 34; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 35; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 36; GFX7LESS-NEXT: buffer_wbinvl1 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 40; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 41; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 42; GFX7LESS-NEXT: s_mov_b32 s2, -1 43; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 61; GFX8-NEXT: s_mov_b32 m0, -1 62; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 63; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 64; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 65; GFX8-NEXT: buffer_wbinvl1_vol 66; GFX8-NEXT: BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: v_readfirstlane_b32 s2, v1 69; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 70; GFX8-NEXT: s_mov_b32 s3, 0xf000 71; GFX8-NEXT: s_mov_b32 s2, -1 72; GFX8-NEXT: s_waitcnt lgkmcnt(0) 73; GFX8-NEXT: s_nop 0 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 91; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 92; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 93; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 94; GFX9-NEXT: buffer_wbinvl1_vol 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: v_readfirstlane_b32 s2, v1 98; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 99; GFX9-NEXT: s_mov_b32 s3, 0xf000 100; GFX9-NEXT: s_mov_b32 s2, -1 101; GFX9-NEXT: s_waitcnt lgkmcnt(0) 102; GFX9-NEXT: s_nop 0 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 109; GFX1064-NEXT: s_mov_b64 s[2:3], exec 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; GFX1064-NEXT: ; %bb.1: 
117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 119; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 120; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 121; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 122; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: buffer_gl0_inv 125; GFX1064-NEXT: buffer_gl1_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 134; GFX1064-NEXT: s_nop 0 135; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 136; GFX1064-NEXT: s_endpgm 137; 138; GFX1032-LABEL: add_i32_constant: 139; GFX1032: ; %bb.0: ; %entry 140; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 141; GFX1032-NEXT: s_mov_b32 s2, exec_lo 142; GFX1032-NEXT: ; implicit-def: $vcc_hi 143; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 144; GFX1032-NEXT: ; implicit-def: $vgpr1 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz BB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 150; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 151; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 152; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 153; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 154; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 155; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 156; GFX1032-NEXT: buffer_gl0_inv 157; GFX1032-NEXT: buffer_gl1_inv 158; GFX1032-NEXT: BB0_2: 159; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; 
GFX1032-NEXT: s_mov_b32 s2, -1 165; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 166; GFX1032-NEXT: s_nop 0 167; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 168; GFX1032-NEXT: s_endpgm 169entry: 170 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 171 store i32 %old, i32 addrspace(1)* %out 172 ret void 173} 174 175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 176; 177; 178; GFX7LESS-LABEL: add_i32_uniform: 179; GFX7LESS: ; %bb.0: ; %entry 180; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 181; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 182; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 183; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 184; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 185; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 186; GFX7LESS-NEXT: ; implicit-def: $vgpr1 187; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 188; GFX7LESS-NEXT: s_cbranch_execz BB1_2 189; GFX7LESS-NEXT: ; %bb.1: 190; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 191; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 192; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 193; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 194; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 195; GFX7LESS-NEXT: s_mov_b32 m0, -1 196; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 197; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 198; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 199; GFX7LESS-NEXT: buffer_wbinvl1 200; GFX7LESS-NEXT: BB1_2: 201; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 202; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 203; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 204; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 205; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 206; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 207; GFX7LESS-NEXT: s_mov_b32 s6, -1 208; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 209; GFX7LESS-NEXT: s_endpgm 210; 211; GFX8-LABEL: add_i32_uniform: 212; GFX8: ; %bb.0: ; %entry 213; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 214; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 
215; GFX8-NEXT: s_mov_b64 s[2:3], exec 216; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 217; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 218; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 219; GFX8-NEXT: ; implicit-def: $vgpr1 220; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 221; GFX8-NEXT: s_cbranch_execz BB1_2 222; GFX8-NEXT: ; %bb.1: 223; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 225; GFX8-NEXT: s_mul_i32 s1, s0, s1 226; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 227; GFX8-NEXT: v_mov_b32_e32 v2, s1 228; GFX8-NEXT: s_mov_b32 m0, -1 229; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 230; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 231; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 232; GFX8-NEXT: buffer_wbinvl1_vol 233; GFX8-NEXT: BB1_2: 234; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 236; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 237; GFX8-NEXT: v_readfirstlane_b32 s0, v1 238; GFX8-NEXT: s_mov_b32 s7, 0xf000 239; GFX8-NEXT: s_mov_b32 s6, -1 240; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 241; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 242; GFX8-NEXT: s_endpgm 243; 244; GFX9-LABEL: add_i32_uniform: 245; GFX9: ; %bb.0: ; %entry 246; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 247; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 248; GFX9-NEXT: s_mov_b64 s[2:3], exec 249; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 250; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 251; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 252; GFX9-NEXT: ; implicit-def: $vgpr1 253; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 254; GFX9-NEXT: s_cbranch_execz BB1_2 255; GFX9-NEXT: ; %bb.1: 256; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 257; GFX9-NEXT: s_waitcnt lgkmcnt(0) 258; GFX9-NEXT: s_mul_i32 s1, s0, s1 259; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 260; GFX9-NEXT: v_mov_b32_e32 v2, s1 261; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 262; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 263; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 264; GFX9-NEXT: buffer_wbinvl1_vol 265; GFX9-NEXT: 
BB1_2: 266; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 267; GFX9-NEXT: s_waitcnt lgkmcnt(0) 268; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 269; GFX9-NEXT: v_readfirstlane_b32 s0, v1 270; GFX9-NEXT: s_mov_b32 s7, 0xf000 271; GFX9-NEXT: s_mov_b32 s6, -1 272; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 273; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 274; GFX9-NEXT: s_endpgm 275; 276; GFX1064-LABEL: add_i32_uniform: 277; GFX1064: ; %bb.0: ; %entry 278; GFX1064-NEXT: s_clause 0x1 279; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 280; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 281; GFX1064-NEXT: s_mov_b64 s[2:3], exec 282; GFX1064-NEXT: ; implicit-def: $vgpr1 283; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 284; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 285; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 286; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 287; GFX1064-NEXT: s_cbranch_execz BB1_2 288; GFX1064-NEXT: ; %bb.1: 289; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 290; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: s_mul_i32 s1, s0, s1 293; GFX1064-NEXT: v_mov_b32_e32 v2, s1 294; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 295; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 296; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 297; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 298; GFX1064-NEXT: buffer_gl0_inv 299; GFX1064-NEXT: buffer_gl1_inv 300; GFX1064-NEXT: BB1_2: 301; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 302; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 303; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 304; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 305; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 306; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 307; GFX1064-NEXT: s_mov_b32 s6, -1 308; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 309; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 310; GFX1064-NEXT: s_endpgm 311; 312; GFX1032-LABEL: add_i32_uniform: 313; GFX1032: ; %bb.0: ; %entry 314; GFX1032-NEXT: s_clause 0x1 315; GFX1032-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 316; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 317; GFX1032-NEXT: s_mov_b32 s2, exec_lo 318; GFX1032-NEXT: ; implicit-def: $vcc_hi 319; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 320; GFX1032-NEXT: ; implicit-def: $vgpr1 321; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 322; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 323; GFX1032-NEXT: s_cbranch_execz BB1_2 324; GFX1032-NEXT: ; %bb.1: 325; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 326; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 327; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 328; GFX1032-NEXT: s_mul_i32 s2, s0, s2 329; GFX1032-NEXT: v_mov_b32_e32 v2, s2 330; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 331; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 332; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 333; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 334; GFX1032-NEXT: buffer_gl0_inv 335; GFX1032-NEXT: buffer_gl1_inv 336; GFX1032-NEXT: BB1_2: 337; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 338; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 339; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 340; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 341; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 342; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 343; GFX1032-NEXT: s_mov_b32 s6, -1 344; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 345; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 346; GFX1032-NEXT: s_endpgm 347entry: 348 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 349 store i32 %old, i32 addrspace(1)* %out 350 ret void 351} 352 353define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 354; 355; 356; GFX7LESS-LABEL: add_i32_varying: 357; GFX7LESS: ; %bb.0: ; %entry 358; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 359; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 360; GFX7LESS-NEXT: s_mov_b32 m0, -1 361; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 362; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 363; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 364; GFX7LESS-NEXT: 
buffer_wbinvl1 365; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 366; GFX7LESS-NEXT: s_mov_b32 s2, -1 367; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 368; GFX7LESS-NEXT: s_endpgm 369; 370; GFX8-LABEL: add_i32_varying: 371; GFX8: ; %bb.0: ; %entry 372; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 373; GFX8-NEXT: v_mov_b32_e32 v2, v0 374; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 375; GFX8-NEXT: v_mov_b32_e32 v1, 0 376; GFX8-NEXT: s_mov_b64 exec, s[2:3] 377; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 378; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 379; GFX8-NEXT: s_not_b64 exec, exec 380; GFX8-NEXT: v_mov_b32_e32 v2, 0 381; GFX8-NEXT: s_not_b64 exec, exec 382; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 383; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 384; GFX8-NEXT: s_nop 1 385; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 386; GFX8-NEXT: s_nop 1 387; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 388; GFX8-NEXT: s_nop 1 389; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 390; GFX8-NEXT: s_nop 1 391; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 392; GFX8-NEXT: s_nop 1 393; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 394; GFX8-NEXT: v_readlane_b32 s4, v2, 63 395; GFX8-NEXT: s_nop 0 396; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 397; GFX8-NEXT: s_mov_b64 exec, s[2:3] 398; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 399; GFX8-NEXT: ; implicit-def: $vgpr0 400; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 401; GFX8-NEXT: s_cbranch_execz BB2_2 402; GFX8-NEXT: ; %bb.1: 403; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 404; GFX8-NEXT: v_mov_b32_e32 v3, s4 405; GFX8-NEXT: s_mov_b32 m0, -1 406; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 407; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 408; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 409; GFX8-NEXT: buffer_wbinvl1_vol 410; GFX8-NEXT: BB2_2: 411; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 412; GFX8-NEXT: v_readfirstlane_b32 s2, v0 413; GFX8-NEXT: v_mov_b32_e32 v0, v1 414; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 415; GFX8-NEXT: s_mov_b32 s3, 0xf000 416; GFX8-NEXT: s_mov_b32 s2, -1 417; GFX8-NEXT: s_waitcnt lgkmcnt(0) 418; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 419; GFX8-NEXT: s_endpgm 420; 421; GFX9-LABEL: add_i32_varying: 422; GFX9: ; %bb.0: ; %entry 423; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 424; GFX9-NEXT: v_mov_b32_e32 v2, v0 425; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 426; GFX9-NEXT: v_mov_b32_e32 v1, 0 427; GFX9-NEXT: s_mov_b64 exec, s[2:3] 428; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 429; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 430; GFX9-NEXT: s_not_b64 exec, exec 431; GFX9-NEXT: v_mov_b32_e32 v2, 0 432; GFX9-NEXT: s_not_b64 exec, exec 433; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 435; GFX9-NEXT: s_nop 1 436; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 437; GFX9-NEXT: s_nop 1 438; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 439; GFX9-NEXT: s_nop 1 440; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 441; GFX9-NEXT: s_nop 1 442; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 443; GFX9-NEXT: s_nop 1 444; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 445; GFX9-NEXT: v_readlane_b32 s4, v2, 63 446; GFX9-NEXT: s_nop 0 447; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 448; GFX9-NEXT: s_mov_b64 exec, s[2:3] 449; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 450; GFX9-NEXT: ; implicit-def: $vgpr0 451; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 452; GFX9-NEXT: s_cbranch_execz BB2_2 453; GFX9-NEXT: ; %bb.1: 
454; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 455; GFX9-NEXT: v_mov_b32_e32 v3, s4 456; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 457; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 458; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 459; GFX9-NEXT: buffer_wbinvl1_vol 460; GFX9-NEXT: BB2_2: 461; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 462; GFX9-NEXT: v_readfirstlane_b32 s2, v0 463; GFX9-NEXT: v_mov_b32_e32 v0, v1 464; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 465; GFX9-NEXT: s_mov_b32 s3, 0xf000 466; GFX9-NEXT: s_mov_b32 s2, -1 467; GFX9-NEXT: s_waitcnt lgkmcnt(0) 468; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 469; GFX9-NEXT: s_endpgm 470; 471; GFX1064-LABEL: add_i32_varying: 472; GFX1064: ; %bb.0: ; %entry 473; GFX1064-NEXT: v_mov_b32_e32 v1, v0 474; GFX1064-NEXT: s_not_b64 exec, exec 475; GFX1064-NEXT: v_mov_b32_e32 v1, 0 476; GFX1064-NEXT: s_not_b64 exec, exec 477; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 479; GFX1064-NEXT: v_mov_b32_e32 v3, 0 480; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 481; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 482; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 483; GFX1064-NEXT: v_mov_b32_e32 v2, v1 484; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 485; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 486; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 487; GFX1064-NEXT: v_mov_b32_e32 v2, s4 488; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 489; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 490; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 491; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 492; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 493; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 494; GFX1064-NEXT: 
v_readlane_b32 s5, v1, 31 495; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 496; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 497; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 498; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 499; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 500; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 501; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 502; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 503; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 504; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 505; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 506; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 507; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 508; GFX1064-NEXT: s_mov_b32 s2, -1 509; GFX1064-NEXT: ; implicit-def: $vgpr0 510; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 511; GFX1064-NEXT: s_cbranch_execz BB2_2 512; GFX1064-NEXT: ; %bb.1: 513; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 514; GFX1064-NEXT: v_mov_b32_e32 v4, s7 515; GFX1064-NEXT: s_mov_b32 s3, s7 516; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 517; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 518; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 519; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 520; GFX1064-NEXT: buffer_gl0_inv 521; GFX1064-NEXT: buffer_gl1_inv 522; GFX1064-NEXT: BB2_2: 523; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 524; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 525; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 526; GFX1064-NEXT: v_mov_b32_e32 v0, v3 527; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 528; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 529; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 530; GFX1064-NEXT: s_nop 0 531; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 532; GFX1064-NEXT: s_endpgm 533; 534; GFX1032-LABEL: add_i32_varying: 535; GFX1032: ; %bb.0: ; %entry 536; GFX1032-NEXT: v_mov_b32_e32 v1, v0 537; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 538; GFX1032-NEXT: v_mov_b32_e32 v1, 0 539; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 540; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 541; GFX1032-NEXT: v_add_nc_u32_dpp v1, 
v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 542; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 543; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 544; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 545; GFX1032-NEXT: v_mov_b32_e32 v2, v1 546; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 547; GFX1032-NEXT: s_mov_b32 exec_lo, s2 548; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 549; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 550; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 551; GFX1032-NEXT: v_mov_b32_e32 v3, 0 552; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 553; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 554; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 555; GFX1032-NEXT: s_mov_b32 exec_lo, s2 556; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 557; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 558; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 559; GFX1032-NEXT: s_mov_b32 exec_lo, s2 560; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 561; GFX1032-NEXT: s_mov_b32 s2, -1 562; GFX1032-NEXT: ; implicit-def: $vgpr0 563; GFX1032-NEXT: ; implicit-def: $vcc_hi 564; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 565; GFX1032-NEXT: s_cbranch_execz BB2_2 566; GFX1032-NEXT: ; %bb.1: 567; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 568; GFX1032-NEXT: v_mov_b32_e32 v4, s4 569; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 570; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 571; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 572; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 573; GFX1032-NEXT: buffer_gl0_inv 574; GFX1032-NEXT: buffer_gl1_inv 575; GFX1032-NEXT: BB2_2: 576; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 577; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 578; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 579; GFX1032-NEXT: v_mov_b32_e32 v0, v3 580; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, 
v0 581; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 582; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 583; GFX1032-NEXT: s_nop 0 584; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 585; GFX1032-NEXT: s_endpgm 586entry: 587 %lane = call i32 @llvm.amdgcn.workitem.id.x() 588 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 589 store i32 %old, i32 addrspace(1)* %out 590 ret void 591} 592 593define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 594; 595; 596; GFX7LESS-LABEL: add_i32_varying_gfx1032: 597; GFX7LESS: ; %bb.0: ; %entry 598; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 599; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 600; GFX7LESS-NEXT: s_mov_b32 m0, -1 601; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 602; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 603; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 604; GFX7LESS-NEXT: buffer_wbinvl1 605; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 606; GFX7LESS-NEXT: s_mov_b32 s2, -1 607; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 608; GFX7LESS-NEXT: s_endpgm 609; 610; GFX8-LABEL: add_i32_varying_gfx1032: 611; GFX8: ; %bb.0: ; %entry 612; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 613; GFX8-NEXT: v_mov_b32_e32 v2, v0 614; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 615; GFX8-NEXT: v_mov_b32_e32 v1, 0 616; GFX8-NEXT: s_mov_b64 exec, s[2:3] 617; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 618; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 619; GFX8-NEXT: s_not_b64 exec, exec 620; GFX8-NEXT: v_mov_b32_e32 v2, 0 621; GFX8-NEXT: s_not_b64 exec, exec 622; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 623; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 624; GFX8-NEXT: s_nop 1 625; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 626; GFX8-NEXT: s_nop 1 627; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 628; GFX8-NEXT: s_nop 1 629; GFX8-NEXT: v_add_u32_dpp v2, 
vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 630; GFX8-NEXT: s_nop 1 631; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 632; GFX8-NEXT: s_nop 1 633; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 634; GFX8-NEXT: v_readlane_b32 s4, v2, 63 635; GFX8-NEXT: s_nop 0 636; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 637; GFX8-NEXT: s_mov_b64 exec, s[2:3] 638; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 639; GFX8-NEXT: ; implicit-def: $vgpr0 640; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 641; GFX8-NEXT: s_cbranch_execz BB3_2 642; GFX8-NEXT: ; %bb.1: 643; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 644; GFX8-NEXT: v_mov_b32_e32 v3, s4 645; GFX8-NEXT: s_mov_b32 m0, -1 646; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 647; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 648; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 649; GFX8-NEXT: buffer_wbinvl1_vol 650; GFX8-NEXT: BB3_2: 651; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 652; GFX8-NEXT: v_readfirstlane_b32 s2, v0 653; GFX8-NEXT: v_mov_b32_e32 v0, v1 654; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 655; GFX8-NEXT: s_mov_b32 s3, 0xf000 656; GFX8-NEXT: s_mov_b32 s2, -1 657; GFX8-NEXT: s_waitcnt lgkmcnt(0) 658; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 659; GFX8-NEXT: s_endpgm 660; 661; GFX9-LABEL: add_i32_varying_gfx1032: 662; GFX9: ; %bb.0: ; %entry 663; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 664; GFX9-NEXT: v_mov_b32_e32 v2, v0 665; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 666; GFX9-NEXT: v_mov_b32_e32 v1, 0 667; GFX9-NEXT: s_mov_b64 exec, s[2:3] 668; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 669; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 670; GFX9-NEXT: s_not_b64 exec, exec 671; GFX9-NEXT: v_mov_b32_e32 v2, 0 672; GFX9-NEXT: s_not_b64 exec, exec 673; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 674; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 675; GFX9-NEXT: s_nop 1 676; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 677; GFX9-NEXT: s_nop 1 678; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 679; GFX9-NEXT: s_nop 1 680; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 681; GFX9-NEXT: s_nop 1 682; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 683; GFX9-NEXT: s_nop 1 684; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 685; GFX9-NEXT: v_readlane_b32 s4, v2, 63 686; GFX9-NEXT: s_nop 0 687; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 688; GFX9-NEXT: s_mov_b64 exec, s[2:3] 689; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 690; GFX9-NEXT: ; implicit-def: $vgpr0 691; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 692; GFX9-NEXT: s_cbranch_execz BB3_2 693; GFX9-NEXT: ; %bb.1: 694; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 695; GFX9-NEXT: v_mov_b32_e32 v3, s4 696; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 697; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 698; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 699; GFX9-NEXT: buffer_wbinvl1_vol 700; GFX9-NEXT: BB3_2: 701; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 702; GFX9-NEXT: v_readfirstlane_b32 s2, v0 703; GFX9-NEXT: v_mov_b32_e32 v0, v1 704; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 705; GFX9-NEXT: s_mov_b32 s3, 0xf000 706; GFX9-NEXT: s_mov_b32 s2, -1 707; GFX9-NEXT: s_waitcnt lgkmcnt(0) 708; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 709; GFX9-NEXT: s_endpgm 710; 711; GFX1064-LABEL: add_i32_varying_gfx1032: 712; GFX1064: ; %bb.0: ; %entry 713; GFX1064-NEXT: v_mov_b32_e32 v1, v0 714; GFX1064-NEXT: s_not_b64 exec, exec 715; GFX1064-NEXT: v_mov_b32_e32 v1, 0 716; GFX1064-NEXT: s_not_b64 exec, exec 717; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 718; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 719; GFX1064-NEXT: v_mov_b32_e32 v3, 0 720; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 721; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 722; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 723; GFX1064-NEXT: v_mov_b32_e32 v2, v1 724; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 725; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 726; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 727; GFX1064-NEXT: v_mov_b32_e32 v2, s4 728; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 729; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 730; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 731; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 732; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 733; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 734; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 735; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 736; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 737; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 738; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 739; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 740; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 741; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 742; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 743; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 744; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 745; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 746; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 747; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 748; GFX1064-NEXT: s_mov_b32 s2, -1 749; GFX1064-NEXT: ; implicit-def: $vgpr0 750; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 751; GFX1064-NEXT: s_cbranch_execz BB3_2 752; GFX1064-NEXT: ; %bb.1: 753; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 754; GFX1064-NEXT: v_mov_b32_e32 v4, s7 755; GFX1064-NEXT: s_mov_b32 s3, s7 756; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 757; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 758; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, 
v4 759; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 760; GFX1064-NEXT: buffer_gl0_inv 761; GFX1064-NEXT: buffer_gl1_inv 762; GFX1064-NEXT: BB3_2: 763; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 764; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 765; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 766; GFX1064-NEXT: v_mov_b32_e32 v0, v3 767; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 768; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 769; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 770; GFX1064-NEXT: s_nop 0 771; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 772; GFX1064-NEXT: s_endpgm 773; 774; GFX1032-LABEL: add_i32_varying_gfx1032: 775; GFX1032: ; %bb.0: ; %entry 776; GFX1032-NEXT: v_mov_b32_e32 v1, v0 777; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 778; GFX1032-NEXT: v_mov_b32_e32 v1, 0 779; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 780; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 781; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 782; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 783; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 784; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 785; GFX1032-NEXT: v_mov_b32_e32 v2, v1 786; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 787; GFX1032-NEXT: s_mov_b32 exec_lo, s2 788; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 789; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 790; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 791; GFX1032-NEXT: v_mov_b32_e32 v3, 0 792; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 793; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 794; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 795; GFX1032-NEXT: s_mov_b32 exec_lo, s2 796; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 797; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 798; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 799; GFX1032-NEXT: s_mov_b32 
exec_lo, s2 800; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 801; GFX1032-NEXT: s_mov_b32 s2, -1 802; GFX1032-NEXT: ; implicit-def: $vgpr0 803; GFX1032-NEXT: ; implicit-def: $vcc_hi 804; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 805; GFX1032-NEXT: s_cbranch_execz BB3_2 806; GFX1032-NEXT: ; %bb.1: 807; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 808; GFX1032-NEXT: v_mov_b32_e32 v4, s4 809; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 810; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 811; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 812; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 813; GFX1032-NEXT: buffer_gl0_inv 814; GFX1032-NEXT: buffer_gl1_inv 815; GFX1032-NEXT: BB3_2: 816; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 817; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 818; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 819; GFX1032-NEXT: v_mov_b32_e32 v0, v3 820; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 821; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 822; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 823; GFX1032-NEXT: s_nop 0 824; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 825; GFX1032-NEXT: s_endpgm 826entry: 827 %lane = call i32 @llvm.amdgcn.workitem.id.x() 828 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 829 store i32 %old, i32 addrspace(1)* %out 830 ret void 831} 832 833define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 834; 835; 836; GFX7LESS-LABEL: add_i32_varying_gfx1064: 837; GFX7LESS: ; %bb.0: ; %entry 838; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 839; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 840; GFX7LESS-NEXT: s_mov_b32 m0, -1 841; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 842; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 843; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 844; GFX7LESS-NEXT: buffer_wbinvl1 845; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 846; GFX7LESS-NEXT: s_mov_b32 s2, -1 847; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 848; GFX7LESS-NEXT: s_endpgm 849; 850; GFX8-LABEL: 
add_i32_varying_gfx1064: 851; GFX8: ; %bb.0: ; %entry 852; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 853; GFX8-NEXT: v_mov_b32_e32 v2, v0 854; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 855; GFX8-NEXT: v_mov_b32_e32 v1, 0 856; GFX8-NEXT: s_mov_b64 exec, s[2:3] 857; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 858; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 859; GFX8-NEXT: s_not_b64 exec, exec 860; GFX8-NEXT: v_mov_b32_e32 v2, 0 861; GFX8-NEXT: s_not_b64 exec, exec 862; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 863; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 864; GFX8-NEXT: s_nop 1 865; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 866; GFX8-NEXT: s_nop 1 867; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 868; GFX8-NEXT: s_nop 1 869; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 870; GFX8-NEXT: s_nop 1 871; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 872; GFX8-NEXT: s_nop 1 873; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 874; GFX8-NEXT: v_readlane_b32 s4, v2, 63 875; GFX8-NEXT: s_nop 0 876; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 877; GFX8-NEXT: s_mov_b64 exec, s[2:3] 878; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 879; GFX8-NEXT: ; implicit-def: $vgpr0 880; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 881; GFX8-NEXT: s_cbranch_execz BB4_2 882; GFX8-NEXT: ; %bb.1: 883; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 884; GFX8-NEXT: v_mov_b32_e32 v3, s4 885; GFX8-NEXT: s_mov_b32 m0, -1 886; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 887; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 888; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 889; GFX8-NEXT: buffer_wbinvl1_vol 890; GFX8-NEXT: BB4_2: 891; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 892; GFX8-NEXT: v_readfirstlane_b32 s2, v0 893; GFX8-NEXT: 
v_mov_b32_e32 v0, v1 894; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 895; GFX8-NEXT: s_mov_b32 s3, 0xf000 896; GFX8-NEXT: s_mov_b32 s2, -1 897; GFX8-NEXT: s_waitcnt lgkmcnt(0) 898; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 899; GFX8-NEXT: s_endpgm 900; 901; GFX9-LABEL: add_i32_varying_gfx1064: 902; GFX9: ; %bb.0: ; %entry 903; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 904; GFX9-NEXT: v_mov_b32_e32 v2, v0 905; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 906; GFX9-NEXT: v_mov_b32_e32 v1, 0 907; GFX9-NEXT: s_mov_b64 exec, s[2:3] 908; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 909; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 910; GFX9-NEXT: s_not_b64 exec, exec 911; GFX9-NEXT: v_mov_b32_e32 v2, 0 912; GFX9-NEXT: s_not_b64 exec, exec 913; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 914; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 915; GFX9-NEXT: s_nop 1 916; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 917; GFX9-NEXT: s_nop 1 918; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 919; GFX9-NEXT: s_nop 1 920; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 921; GFX9-NEXT: s_nop 1 922; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 923; GFX9-NEXT: s_nop 1 924; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 925; GFX9-NEXT: v_readlane_b32 s4, v2, 63 926; GFX9-NEXT: s_nop 0 927; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 928; GFX9-NEXT: s_mov_b64 exec, s[2:3] 929; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 930; GFX9-NEXT: ; implicit-def: $vgpr0 931; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 932; GFX9-NEXT: s_cbranch_execz BB4_2 933; GFX9-NEXT: ; %bb.1: 934; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 935; GFX9-NEXT: v_mov_b32_e32 v3, s4 936; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 937; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 938; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 939; GFX9-NEXT: buffer_wbinvl1_vol 940; GFX9-NEXT: BB4_2: 941; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 942; GFX9-NEXT: v_readfirstlane_b32 s2, v0 943; GFX9-NEXT: v_mov_b32_e32 v0, v1 944; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 945; GFX9-NEXT: s_mov_b32 s3, 0xf000 946; GFX9-NEXT: s_mov_b32 s2, -1 947; GFX9-NEXT: s_waitcnt lgkmcnt(0) 948; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 949; GFX9-NEXT: s_endpgm 950; 951; GFX1064-LABEL: add_i32_varying_gfx1064: 952; GFX1064: ; %bb.0: ; %entry 953; GFX1064-NEXT: v_mov_b32_e32 v1, v0 954; GFX1064-NEXT: s_not_b64 exec, exec 955; GFX1064-NEXT: v_mov_b32_e32 v1, 0 956; GFX1064-NEXT: s_not_b64 exec, exec 957; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 958; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 959; GFX1064-NEXT: v_mov_b32_e32 v3, 0 960; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 961; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 962; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 963; GFX1064-NEXT: v_mov_b32_e32 v2, v1 964; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 965; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 966; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 967; GFX1064-NEXT: v_mov_b32_e32 v2, s4 968; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 969; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 970; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 971; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 972; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 973; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 974; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 975; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 976; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 977; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 978; 
GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 979; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 980; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 981; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 982; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 983; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 984; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 985; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 986; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 987; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 988; GFX1064-NEXT: s_mov_b32 s2, -1 989; GFX1064-NEXT: ; implicit-def: $vgpr0 990; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 991; GFX1064-NEXT: s_cbranch_execz BB4_2 992; GFX1064-NEXT: ; %bb.1: 993; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 994; GFX1064-NEXT: v_mov_b32_e32 v4, s7 995; GFX1064-NEXT: s_mov_b32 s3, s7 996; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 997; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 998; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 999; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1000; GFX1064-NEXT: buffer_gl0_inv 1001; GFX1064-NEXT: buffer_gl1_inv 1002; GFX1064-NEXT: BB4_2: 1003; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1004; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1005; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1006; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1007; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 1008; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1009; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1010; GFX1064-NEXT: s_nop 0 1011; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1012; GFX1064-NEXT: s_endpgm 1013; 1014; GFX1032-LABEL: add_i32_varying_gfx1064: 1015; GFX1032: ; %bb.0: ; %entry 1016; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1017; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1018; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1019; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1020; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1021; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1022; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:0 1023; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1024; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1025; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1026; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1027; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1028; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1029; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1030; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1031; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1032; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1033; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1034; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1035; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1036; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1037; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1038; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1039; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1040; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1041; GFX1032-NEXT: s_mov_b32 s2, -1 1042; GFX1032-NEXT: ; implicit-def: $vgpr0 1043; GFX1032-NEXT: ; implicit-def: $vcc_hi 1044; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1045; GFX1032-NEXT: s_cbranch_execz BB4_2 1046; GFX1032-NEXT: ; %bb.1: 1047; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1048; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1049; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1050; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1051; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 1052; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1053; GFX1032-NEXT: buffer_gl0_inv 1054; GFX1032-NEXT: buffer_gl1_inv 1055; GFX1032-NEXT: BB4_2: 1056; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1057; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1058; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1059; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1060; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1061; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1062; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1063; 
GFX1032-NEXT: s_nop 0 1064; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1065; GFX1032-NEXT: s_endpgm 1066entry: 1067 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1068 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1069 store i32 %old, i32 addrspace(1)* %out 1070 ret void 1071} 1072 1073define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1074; 1075; 1076; GFX7LESS-LABEL: add_i64_constant: 1077; GFX7LESS: ; %bb.0: ; %entry 1078; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1079; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1080; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1081; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1082; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1083; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1084; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1085; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1086; GFX7LESS-NEXT: ; %bb.1: 1087; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1088; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1089; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1090; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1091; GFX7LESS-NEXT: s_mov_b32 m0, -1 1092; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1093; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1094; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1095; GFX7LESS-NEXT: buffer_wbinvl1 1096; GFX7LESS-NEXT: BB5_2: 1097; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1098; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1099; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1100; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1101; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1102; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1103; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1104; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1105; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1106; GFX7LESS-NEXT: s_mov_b32 s2, -1 1107; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1108; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1109; GFX7LESS-NEXT: s_endpgm 1110; 
1111; GFX8-LABEL: add_i64_constant: 1112; GFX8: ; %bb.0: ; %entry 1113; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1114; GFX8-NEXT: s_mov_b64 s[4:5], exec 1115; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1116; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1117; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1118; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1119; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1120; GFX8-NEXT: s_cbranch_execz BB5_2 1121; GFX8-NEXT: ; %bb.1: 1122; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1123; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1124; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1125; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1126; GFX8-NEXT: s_mov_b32 m0, -1 1127; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1128; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1129; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1130; GFX8-NEXT: buffer_wbinvl1_vol 1131; GFX8-NEXT: BB5_2: 1132; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1133; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1134; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1135; GFX8-NEXT: v_mov_b32_e32 v1, s2 1136; GFX8-NEXT: v_mov_b32_e32 v2, s3 1137; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1138; GFX8-NEXT: s_mov_b32 s3, 0xf000 1139; GFX8-NEXT: s_mov_b32 s2, -1 1140; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1141; GFX8-NEXT: s_nop 1 1142; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1143; GFX8-NEXT: s_endpgm 1144; 1145; GFX9-LABEL: add_i64_constant: 1146; GFX9: ; %bb.0: ; %entry 1147; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1148; GFX9-NEXT: s_mov_b64 s[4:5], exec 1149; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1150; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1151; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1152; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1153; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1154; GFX9-NEXT: s_cbranch_execz BB5_2 1155; GFX9-NEXT: ; %bb.1: 1156; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1157; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1158; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1159; GFX9-NEXT: 
v_mov_b32_e32 v3, local_var64@abs32@lo 1160; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1161; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1162; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1163; GFX9-NEXT: buffer_wbinvl1_vol 1164; GFX9-NEXT: BB5_2: 1165; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1166; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1167; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1168; GFX9-NEXT: v_mov_b32_e32 v1, s2 1169; GFX9-NEXT: v_mov_b32_e32 v2, s3 1170; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1171; GFX9-NEXT: s_mov_b32 s3, 0xf000 1172; GFX9-NEXT: s_mov_b32 s2, -1 1173; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1174; GFX9-NEXT: s_nop 1 1175; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1176; GFX9-NEXT: s_endpgm 1177; 1178; GFX1064-LABEL: add_i64_constant: 1179; GFX1064: ; %bb.0: ; %entry 1180; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1181; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1182; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1183; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1184; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 1185; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1186; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1187; GFX1064-NEXT: s_cbranch_execz BB5_2 1188; GFX1064-NEXT: ; %bb.1: 1189; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1190; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1191; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1192; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1193; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1194; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1195; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1196; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1197; GFX1064-NEXT: buffer_gl0_inv 1198; GFX1064-NEXT: buffer_gl1_inv 1199; GFX1064-NEXT: BB5_2: 1200; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1201; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1202; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1203; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1204; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1205; 
GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1206; GFX1064-NEXT: s_mov_b32 s2, -1 1207; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1208; GFX1064-NEXT: s_nop 1 1209; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1210; GFX1064-NEXT: s_endpgm 1211; 1212; GFX1032-LABEL: add_i64_constant: 1213; GFX1032: ; %bb.0: ; %entry 1214; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1215; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1216; GFX1032-NEXT: ; implicit-def: $vcc_hi 1217; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1218; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1219; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1220; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1221; GFX1032-NEXT: s_cbranch_execz BB5_2 1222; GFX1032-NEXT: ; %bb.1: 1223; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1224; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1225; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1226; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1227; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1228; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1229; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1230; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1231; GFX1032-NEXT: buffer_gl0_inv 1232; GFX1032-NEXT: buffer_gl1_inv 1233; GFX1032-NEXT: BB5_2: 1234; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1235; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1236; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1237; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1238; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1239; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1240; GFX1032-NEXT: s_mov_b32 s2, -1 1241; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1242; GFX1032-NEXT: s_nop 1 1243; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1244; GFX1032-NEXT: s_endpgm 1245entry: 1246 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1247 store i64 %old, i64 addrspace(1)* %out 1248 ret void 1249} 1250 1251define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1252; 1253; 1254; GFX7LESS-LABEL: 
add_i64_uniform: 1255; GFX7LESS: ; %bb.0: ; %entry 1256; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1257; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1258; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1259; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1260; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1261; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1262; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1263; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1264; GFX7LESS-NEXT: ; %bb.1: 1265; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1266; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1267; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1268; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1269; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1270; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1271; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1272; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1273; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1274; GFX7LESS-NEXT: s_mov_b32 m0, -1 1275; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1276; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1277; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1278; GFX7LESS-NEXT: buffer_wbinvl1 1279; GFX7LESS-NEXT: BB6_2: 1280; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1281; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1282; GFX7LESS-NEXT: s_mov_b32 s6, -1 1283; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1284; GFX7LESS-NEXT: s_mov_b32 s4, s0 1285; GFX7LESS-NEXT: s_mov_b32 s5, s1 1286; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1287; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1288; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1289; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1290; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1291; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1292; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1293; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1294; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1295; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1296; GFX7LESS-NEXT: s_endpgm 1297; 1298; GFX8-LABEL: add_i64_uniform: 1299; GFX8: ; %bb.0: ; 
%entry 1300; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1301; GFX8-NEXT: s_mov_b64 s[6:7], exec 1302; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1303; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1304; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1305; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1306; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1307; GFX8-NEXT: s_cbranch_execz BB6_2 1308; GFX8-NEXT: ; %bb.1: 1309; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1310; GFX8-NEXT: v_mov_b32_e32 v1, s6 1311; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1312; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 1313; GFX8-NEXT: s_mul_i32 s7, s3, s6 1314; GFX8-NEXT: s_mul_i32 s6, s2, s6 1315; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1316; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1317; GFX8-NEXT: v_mov_b32_e32 v1, s6 1318; GFX8-NEXT: s_mov_b32 m0, -1 1319; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1320; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1321; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1322; GFX8-NEXT: buffer_wbinvl1_vol 1323; GFX8-NEXT: BB6_2: 1324; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1325; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1326; GFX8-NEXT: s_mov_b32 s4, s0 1327; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1328; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1329; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1330; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1331; GFX8-NEXT: s_mov_b32 s5, s1 1332; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1333; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1334; GFX8-NEXT: v_mov_b32_e32 v2, s1 1335; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1336; GFX8-NEXT: s_mov_b32 s7, 0xf000 1337; GFX8-NEXT: s_mov_b32 s6, -1 1338; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1339; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1340; GFX8-NEXT: s_endpgm 1341; 1342; GFX9-LABEL: add_i64_uniform: 1343; GFX9: ; %bb.0: ; %entry 1344; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1345; GFX9-NEXT: s_mov_b64 s[6:7], exec 1346; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1347; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1348; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 1349; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1350; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1351; GFX9-NEXT: s_cbranch_execz BB6_2 1352; GFX9-NEXT: ; %bb.1: 1353; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1354; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1355; GFX9-NEXT: s_mul_i32 s7, s3, s6 1356; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1357; GFX9-NEXT: s_add_i32 s8, s8, s7 1358; GFX9-NEXT: s_mul_i32 s6, s2, s6 1359; GFX9-NEXT: v_mov_b32_e32 v1, s6 1360; GFX9-NEXT: v_mov_b32_e32 v2, s8 1361; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1362; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1363; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1364; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1365; GFX9-NEXT: buffer_wbinvl1_vol 1366; GFX9-NEXT: BB6_2: 1367; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1368; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1369; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1370; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1371; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1372; GFX9-NEXT: s_mov_b32 s4, s0 1373; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1374; GFX9-NEXT: s_mov_b32 s5, s1 1375; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1376; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1377; GFX9-NEXT: v_mov_b32_e32 v2, s1 1378; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1379; GFX9-NEXT: s_mov_b32 s7, 0xf000 1380; GFX9-NEXT: s_mov_b32 s6, -1 1381; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1382; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1383; GFX9-NEXT: s_endpgm 1384; 1385; GFX1064-LABEL: add_i64_uniform: 1386; GFX1064: ; %bb.0: ; %entry 1387; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1388; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1389; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1390; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1391; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1392; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1393; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1394; GFX1064-NEXT: s_cbranch_execz BB6_2 1395; GFX1064-NEXT: ; %bb.1: 1396; GFX1064-NEXT: 
s_bcnt1_i32_b64 s6, s[6:7] 1397; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1398; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1399; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1400; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1401; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1402; GFX1064-NEXT: s_add_i32 s8, s8, s7 1403; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1404; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1405; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1406; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1407; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1408; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1409; GFX1064-NEXT: buffer_gl0_inv 1410; GFX1064-NEXT: buffer_gl1_inv 1411; GFX1064-NEXT: BB6_2: 1412; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1413; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1414; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1415; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1416; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1417; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1418; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1419; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1420; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1421; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1422; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 1423; GFX1064-NEXT: s_mov_b32 s2, -1 1424; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1425; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1426; GFX1064-NEXT: s_endpgm 1427; 1428; GFX1032-LABEL: add_i64_uniform: 1429; GFX1032: ; %bb.0: ; %entry 1430; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1431; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1432; GFX1032-NEXT: ; implicit-def: $vcc_hi 1433; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1434; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1435; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1436; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1437; GFX1032-NEXT: s_cbranch_execz BB6_2 1438; GFX1032-NEXT: ; %bb.1: 1439; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1440; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1441; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) 1442; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1443; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1444; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1445; GFX1032-NEXT: s_add_i32 s7, s7, s6 1446; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1447; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1448; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1449; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1450; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1451; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1452; GFX1032-NEXT: buffer_gl0_inv 1453; GFX1032-NEXT: buffer_gl1_inv 1454; GFX1032-NEXT: BB6_2: 1455; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1456; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1457; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1458; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1459; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1460; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1461; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1462; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 1463; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1464; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1465; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 1466; GFX1032-NEXT: s_mov_b32 s2, -1 1467; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1468; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1469; GFX1032-NEXT: s_endpgm 1470entry: 1471 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1472 store i64 %old, i64 addrspace(1)* %out 1473 ret void 1474} 1475 1476; The expected output for add_i64_varying contains none of v_mbcnt_lo_u32_b32, 1477; v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64. These used to be GCN-NOT directives, but no RUN 1478; line enables a GCN check prefix, so the per-target -NEXT checks below enforce it instead. 1479define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1480; 1481; 1482; GFX7LESS-LABEL: add_i64_varying: 1483; GFX7LESS: ; %bb.0: ; %entry 1484; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1485; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1486; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1487; GFX7LESS-NEXT: s_mov_b32 m0, -1 1488; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1489; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1490; 
GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1491; GFX7LESS-NEXT: buffer_wbinvl1 1492; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1493; GFX7LESS-NEXT: s_mov_b32 s2, -1 1494; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1495; GFX7LESS-NEXT: s_endpgm 1496; 1497; GFX8-LABEL: add_i64_varying: 1498; GFX8: ; %bb.0: ; %entry 1499; GFX8-NEXT: v_mov_b32_e32 v1, 0 1500; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1501; GFX8-NEXT: s_mov_b32 m0, -1 1502; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1503; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1504; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1505; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1506; GFX8-NEXT: buffer_wbinvl1_vol 1507; GFX8-NEXT: s_mov_b32 s3, 0xf000 1508; GFX8-NEXT: s_mov_b32 s2, -1 1509; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1510; GFX8-NEXT: s_endpgm 1511; 1512; GFX9-LABEL: add_i64_varying: 1513; GFX9: ; %bb.0: ; %entry 1514; GFX9-NEXT: v_mov_b32_e32 v1, 0 1515; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1516; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1517; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1518; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1519; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1520; GFX9-NEXT: buffer_wbinvl1_vol 1521; GFX9-NEXT: s_mov_b32 s3, 0xf000 1522; GFX9-NEXT: s_mov_b32 s2, -1 1523; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1524; GFX9-NEXT: s_endpgm 1525; 1526; GFX1064-LABEL: add_i64_varying: 1527; GFX1064: ; %bb.0: ; %entry 1528; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1529; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1530; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1531; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1532; GFX1064-NEXT: s_mov_b32 s2, -1 1533; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1534; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1535; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1536; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1537; GFX1064-NEXT: buffer_gl0_inv 1538; GFX1064-NEXT: buffer_gl1_inv 1539; 
GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1540; GFX1064-NEXT: s_endpgm 1541; 1542; GFX1032-LABEL: add_i64_varying: 1543; GFX1032: ; %bb.0: ; %entry 1544; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1545; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1546; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1547; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1548; GFX1032-NEXT: s_mov_b32 s2, -1 1549; GFX1032-NEXT: ; implicit-def: $vcc_hi 1550; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1551; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1552; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1553; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1554; GFX1032-NEXT: buffer_gl0_inv 1555; GFX1032-NEXT: buffer_gl1_inv 1556; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1557; GFX1032-NEXT: s_endpgm 1558entry: 1559 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1560 %zext = zext i32 %lane to i64 1561 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1562 store i64 %old, i64 addrspace(1)* %out 1563 ret void 1564} 1565 1566define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1567; 1568; 1569; GFX7LESS-LABEL: sub_i32_constant: 1570; GFX7LESS: ; %bb.0: ; %entry 1571; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1572; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1573; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1574; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1575; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1576; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1577; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1578; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1579; GFX7LESS-NEXT: ; %bb.1: 1580; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1581; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1582; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 1583; GFX7LESS-NEXT: s_mov_b32 m0, -1 1584; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1585; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1586; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1587; 
GFX7LESS-NEXT: buffer_wbinvl1 1588; GFX7LESS-NEXT: BB8_2: 1589; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1590; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1591; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1592; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1593; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1594; GFX7LESS-NEXT: s_mov_b32 s2, -1 1595; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1596; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1597; GFX7LESS-NEXT: s_endpgm 1598; 1599; GFX8-LABEL: sub_i32_constant: 1600; GFX8: ; %bb.0: ; %entry 1601; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1602; GFX8-NEXT: s_mov_b64 s[2:3], exec 1603; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1604; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1605; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1606; GFX8-NEXT: ; implicit-def: $vgpr1 1607; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1608; GFX8-NEXT: s_cbranch_execz BB8_2 1609; GFX8-NEXT: ; %bb.1: 1610; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1611; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1612; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1613; GFX8-NEXT: s_mov_b32 m0, -1 1614; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1615; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 1616; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1617; GFX8-NEXT: buffer_wbinvl1_vol 1618; GFX8-NEXT: BB8_2: 1619; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1620; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1621; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1622; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1623; GFX8-NEXT: s_mov_b32 s3, 0xf000 1624; GFX8-NEXT: s_mov_b32 s2, -1 1625; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1626; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1627; GFX8-NEXT: s_endpgm 1628; 1629; GFX9-LABEL: sub_i32_constant: 1630; GFX9: ; %bb.0: ; %entry 1631; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1632; GFX9-NEXT: s_mov_b64 s[2:3], exec 1633; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1634; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1635; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1636; GFX9-NEXT: ; 
implicit-def: $vgpr1 1637; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1638; GFX9-NEXT: s_cbranch_execz BB8_2 1639; GFX9-NEXT: ; %bb.1: 1640; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1641; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1642; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1643; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1644; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 1645; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1646; GFX9-NEXT: buffer_wbinvl1_vol 1647; GFX9-NEXT: BB8_2: 1648; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1649; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1650; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1651; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1652; GFX9-NEXT: s_mov_b32 s3, 0xf000 1653; GFX9-NEXT: s_mov_b32 s2, -1 1654; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1655; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1656; GFX9-NEXT: s_endpgm 1657; 1658; GFX1064-LABEL: sub_i32_constant: 1659; GFX1064: ; %bb.0: ; %entry 1660; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1661; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1662; GFX1064-NEXT: ; implicit-def: $vgpr1 1663; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1664; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1665; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1666; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1667; GFX1064-NEXT: s_cbranch_execz BB8_2 1668; GFX1064-NEXT: ; %bb.1: 1669; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1670; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1671; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1672; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1673; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1674; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 1675; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1676; GFX1064-NEXT: buffer_gl0_inv 1677; GFX1064-NEXT: buffer_gl1_inv 1678; GFX1064-NEXT: BB8_2: 1679; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1680; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1681; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1682; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1683; GFX1064-NEXT: s_mov_b32 
s3, 0x31016000 1684; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1685; GFX1064-NEXT: s_mov_b32 s2, -1 1686; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1687; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1688; GFX1064-NEXT: s_endpgm 1689; 1690; GFX1032-LABEL: sub_i32_constant: 1691; GFX1032: ; %bb.0: ; %entry 1692; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1693; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1694; GFX1032-NEXT: ; implicit-def: $vcc_hi 1695; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1696; GFX1032-NEXT: ; implicit-def: $vgpr1 1697; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1698; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1699; GFX1032-NEXT: s_cbranch_execz BB8_2 1700; GFX1032-NEXT: ; %bb.1: 1701; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1702; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1703; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1704; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1705; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1706; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 1707; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1708; GFX1032-NEXT: buffer_gl0_inv 1709; GFX1032-NEXT: buffer_gl1_inv 1710; GFX1032-NEXT: BB8_2: 1711; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1712; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1713; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1714; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1715; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1716; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1717; GFX1032-NEXT: s_mov_b32 s2, -1 1718; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1719; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1720; GFX1032-NEXT: s_endpgm 1721entry: 1722 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1723 store i32 %old, i32 addrspace(1)* %out 1724 ret void 1725} 1726 1727define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1728; 1729; 1730; GFX7LESS-LABEL: sub_i32_uniform: 1731; GFX7LESS: ; %bb.0: ; %entry 1732; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1733; GFX7LESS-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x9 1734; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 1735; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1736; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1737; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1738; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1739; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1740; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1741; GFX7LESS-NEXT: ; %bb.1: 1742; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1743; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1744; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 1745; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1746; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 1747; GFX7LESS-NEXT: s_mov_b32 m0, -1 1748; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1749; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1750; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1751; GFX7LESS-NEXT: buffer_wbinvl1 1752; GFX7LESS-NEXT: BB9_2: 1753; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1754; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1755; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1756; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1757; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1758; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1759; GFX7LESS-NEXT: s_mov_b32 s6, -1 1760; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1761; GFX7LESS-NEXT: s_endpgm 1762; 1763; GFX8-LABEL: sub_i32_uniform: 1764; GFX8: ; %bb.0: ; %entry 1765; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1766; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1767; GFX8-NEXT: s_mov_b64 s[2:3], exec 1768; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1769; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1770; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1771; GFX8-NEXT: ; implicit-def: $vgpr1 1772; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1773; GFX8-NEXT: s_cbranch_execz BB9_2 1774; GFX8-NEXT: ; %bb.1: 1775; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1776; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1777; GFX8-NEXT: s_mul_i32 s1, s0, s1 1778; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1779; GFX8-NEXT: 
v_mov_b32_e32 v2, s1 1780; GFX8-NEXT: s_mov_b32 m0, -1 1781; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1782; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1783; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1784; GFX8-NEXT: buffer_wbinvl1_vol 1785; GFX8-NEXT: BB9_2: 1786; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1787; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1788; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1789; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1790; GFX8-NEXT: s_mov_b32 s7, 0xf000 1791; GFX8-NEXT: s_mov_b32 s6, -1 1792; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1793; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1794; GFX8-NEXT: s_endpgm 1795; 1796; GFX9-LABEL: sub_i32_uniform: 1797; GFX9: ; %bb.0: ; %entry 1798; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1799; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 1800; GFX9-NEXT: s_mov_b64 s[2:3], exec 1801; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1802; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1803; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1804; GFX9-NEXT: ; implicit-def: $vgpr1 1805; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 1806; GFX9-NEXT: s_cbranch_execz BB9_2 1807; GFX9-NEXT: ; %bb.1: 1808; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1809; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1810; GFX9-NEXT: s_mul_i32 s1, s0, s1 1811; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1812; GFX9-NEXT: v_mov_b32_e32 v2, s1 1813; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1814; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1815; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1816; GFX9-NEXT: buffer_wbinvl1_vol 1817; GFX9-NEXT: BB9_2: 1818; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 1819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1820; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 1821; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1822; GFX9-NEXT: s_mov_b32 s7, 0xf000 1823; GFX9-NEXT: s_mov_b32 s6, -1 1824; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1825; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1826; GFX9-NEXT: s_endpgm 1827; 1828; GFX1064-LABEL: sub_i32_uniform: 1829; GFX1064: ; %bb.0: ; %entry 1830; GFX1064-NEXT: 
s_clause 0x1 1831; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1832; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 1833; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1834; GFX1064-NEXT: ; implicit-def: $vgpr1 1835; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1836; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1837; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1838; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 1839; GFX1064-NEXT: s_cbranch_execz BB9_2 1840; GFX1064-NEXT: ; %bb.1: 1841; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1842; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1843; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1844; GFX1064-NEXT: s_mul_i32 s1, s0, s1 1845; GFX1064-NEXT: v_mov_b32_e32 v2, s1 1846; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1847; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1848; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1849; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1850; GFX1064-NEXT: buffer_gl0_inv 1851; GFX1064-NEXT: buffer_gl1_inv 1852; GFX1064-NEXT: BB9_2: 1853; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1854; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 1855; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1856; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 1857; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1858; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1859; GFX1064-NEXT: s_mov_b32 s6, -1 1860; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1861; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1862; GFX1064-NEXT: s_endpgm 1863; 1864; GFX1032-LABEL: sub_i32_uniform: 1865; GFX1032: ; %bb.0: ; %entry 1866; GFX1032-NEXT: s_clause 0x1 1867; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1868; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 1869; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1870; GFX1032-NEXT: ; implicit-def: $vcc_hi 1871; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1872; GFX1032-NEXT: ; implicit-def: $vgpr1 1873; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1874; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1875; GFX1032-NEXT: s_cbranch_execz BB9_2 1876; 
GFX1032-NEXT: ; %bb.1: 1877; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1878; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1879; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1880; GFX1032-NEXT: s_mul_i32 s2, s0, s2 1881; GFX1032-NEXT: v_mov_b32_e32 v2, s2 1882; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1883; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1884; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1885; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1886; GFX1032-NEXT: buffer_gl0_inv 1887; GFX1032-NEXT: buffer_gl1_inv 1888; GFX1032-NEXT: BB9_2: 1889; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1890; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 1891; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1892; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 1893; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1894; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1895; GFX1032-NEXT: s_mov_b32 s6, -1 1896; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1897; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1898; GFX1032-NEXT: s_endpgm 1899entry: 1900 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1901 store i32 %old, i32 addrspace(1)* %out 1902 ret void 1903} 1904 1905define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1906; 1907; 1908; GFX7LESS-LABEL: sub_i32_varying: 1909; GFX7LESS: ; %bb.0: ; %entry 1910; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1911; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1912; GFX7LESS-NEXT: s_mov_b32 m0, -1 1913; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1914; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1915; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1916; GFX7LESS-NEXT: buffer_wbinvl1 1917; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1918; GFX7LESS-NEXT: s_mov_b32 s2, -1 1919; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1920; GFX7LESS-NEXT: s_endpgm 1921; 1922; GFX8-LABEL: sub_i32_varying: 1923; GFX8: ; %bb.0: ; %entry 1924; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1925; GFX8-NEXT: v_mov_b32_e32 v2, v0 1926; GFX8-NEXT: 
s_or_saveexec_b64 s[2:3], -1 1927; GFX8-NEXT: v_mov_b32_e32 v1, 0 1928; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1929; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1930; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1931; GFX8-NEXT: s_not_b64 exec, exec 1932; GFX8-NEXT: v_mov_b32_e32 v2, 0 1933; GFX8-NEXT: s_not_b64 exec, exec 1934; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1935; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1936; GFX8-NEXT: s_nop 1 1937; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1938; GFX8-NEXT: s_nop 1 1939; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1940; GFX8-NEXT: s_nop 1 1941; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1942; GFX8-NEXT: s_nop 1 1943; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1944; GFX8-NEXT: s_nop 1 1945; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1946; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1947; GFX8-NEXT: s_nop 0 1948; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1949; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1950; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1951; GFX8-NEXT: ; implicit-def: $vgpr0 1952; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1953; GFX8-NEXT: s_cbranch_execz BB10_2 1954; GFX8-NEXT: ; %bb.1: 1955; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1956; GFX8-NEXT: v_mov_b32_e32 v3, s4 1957; GFX8-NEXT: s_mov_b32 m0, -1 1958; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1959; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1960; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1961; GFX8-NEXT: buffer_wbinvl1_vol 1962; GFX8-NEXT: BB10_2: 1963; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1964; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1965; GFX8-NEXT: v_mov_b32_e32 v0, v1 1966; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1967; GFX8-NEXT: s_mov_b32 s3, 0xf000 1968; GFX8-NEXT: 
s_mov_b32 s2, -1 1969; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1970; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1971; GFX8-NEXT: s_endpgm 1972; 1973; GFX9-LABEL: sub_i32_varying: 1974; GFX9: ; %bb.0: ; %entry 1975; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1976; GFX9-NEXT: v_mov_b32_e32 v2, v0 1977; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1978; GFX9-NEXT: v_mov_b32_e32 v1, 0 1979; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1980; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1981; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1982; GFX9-NEXT: s_not_b64 exec, exec 1983; GFX9-NEXT: v_mov_b32_e32 v2, 0 1984; GFX9-NEXT: s_not_b64 exec, exec 1985; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1986; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1987; GFX9-NEXT: s_nop 1 1988; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1989; GFX9-NEXT: s_nop 1 1990; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1991; GFX9-NEXT: s_nop 1 1992; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1993; GFX9-NEXT: s_nop 1 1994; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1995; GFX9-NEXT: s_nop 1 1996; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1997; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1998; GFX9-NEXT: s_nop 0 1999; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2000; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2001; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2002; GFX9-NEXT: ; implicit-def: $vgpr0 2003; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2004; GFX9-NEXT: s_cbranch_execz BB10_2 2005; GFX9-NEXT: ; %bb.1: 2006; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2007; GFX9-NEXT: v_mov_b32_e32 v3, s4 2008; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2009; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2010; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2011; GFX9-NEXT: buffer_wbinvl1_vol 2012; 
GFX9-NEXT: BB10_2: 2013; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2014; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2015; GFX9-NEXT: v_mov_b32_e32 v0, v1 2016; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2017; GFX9-NEXT: s_mov_b32 s3, 0xf000 2018; GFX9-NEXT: s_mov_b32 s2, -1 2019; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2020; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2021; GFX9-NEXT: s_endpgm 2022; 2023; GFX1064-LABEL: sub_i32_varying: 2024; GFX1064: ; %bb.0: ; %entry 2025; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2026; GFX1064-NEXT: s_not_b64 exec, exec 2027; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2028; GFX1064-NEXT: s_not_b64 exec, exec 2029; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2030; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2031; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2032; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2033; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2034; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2035; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2036; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2037; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2038; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2039; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2040; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2041; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2042; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2043; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2044; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2045; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2046; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2047; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2048; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2049; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2050; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2051; 
GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2052; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2053; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2054; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2055; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2056; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2057; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2058; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2059; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2060; GFX1064-NEXT: s_mov_b32 s2, -1 2061; GFX1064-NEXT: ; implicit-def: $vgpr0 2062; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2063; GFX1064-NEXT: s_cbranch_execz BB10_2 2064; GFX1064-NEXT: ; %bb.1: 2065; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2066; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2067; GFX1064-NEXT: s_mov_b32 s3, s7 2068; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2069; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2070; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 2071; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2072; GFX1064-NEXT: buffer_gl0_inv 2073; GFX1064-NEXT: buffer_gl1_inv 2074; GFX1064-NEXT: BB10_2: 2075; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2076; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2077; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2078; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2079; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2080; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2081; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2082; GFX1064-NEXT: s_nop 0 2083; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2084; GFX1064-NEXT: s_endpgm 2085; 2086; GFX1032-LABEL: sub_i32_varying: 2087; GFX1032: ; %bb.0: ; %entry 2088; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2089; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2090; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2091; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2092; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2093; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2094; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2095; GFX1032-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2096; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2097; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2098; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2099; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2100; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2101; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2102; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2103; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2104; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2105; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2106; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2107; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2108; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2109; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2110; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2111; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2112; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2113; GFX1032-NEXT: s_mov_b32 s2, -1 2114; GFX1032-NEXT: ; implicit-def: $vgpr0 2115; GFX1032-NEXT: ; implicit-def: $vcc_hi 2116; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2117; GFX1032-NEXT: s_cbranch_execz BB10_2 2118; GFX1032-NEXT: ; %bb.1: 2119; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2120; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2121; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2122; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2123; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 2124; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2125; GFX1032-NEXT: buffer_gl0_inv 2126; GFX1032-NEXT: buffer_gl1_inv 2127; GFX1032-NEXT: BB10_2: 2128; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2129; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2130; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2131; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2132; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2133; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2134; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2135; GFX1032-NEXT: s_nop 0 2136; 
GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2137; GFX1032-NEXT: s_endpgm 2138entry: 2139 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2140 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2141 store i32 %old, i32 addrspace(1)* %out 2142 ret void 2143} 2144 2145define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2146; 2147; 2148; GFX7LESS-LABEL: sub_i64_constant: 2149; GFX7LESS: ; %bb.0: ; %entry 2150; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2151; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2152; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2153; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 2154; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2155; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2156; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2157; GFX7LESS-NEXT: s_cbranch_execz BB11_2 2158; GFX7LESS-NEXT: ; %bb.1: 2159; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2160; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2161; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2162; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2163; GFX7LESS-NEXT: s_mov_b32 m0, -1 2164; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2165; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2166; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2167; GFX7LESS-NEXT: buffer_wbinvl1 2168; GFX7LESS-NEXT: BB11_2: 2169; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2170; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 2171; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 2172; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2173; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2174; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2175; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2176; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2177; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2178; GFX7LESS-NEXT: s_mov_b32 s2, -1 2179; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2180; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2181; GFX7LESS-NEXT: s_endpgm 2182; 2183; GFX8-LABEL: 
sub_i64_constant: 2184; GFX8: ; %bb.0: ; %entry 2185; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2186; GFX8-NEXT: s_mov_b64 s[4:5], exec 2187; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2188; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2189; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2190; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2191; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2192; GFX8-NEXT: s_cbranch_execz BB11_2 2193; GFX8-NEXT: ; %bb.1: 2194; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2195; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2196; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2197; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2198; GFX8-NEXT: s_mov_b32 m0, -1 2199; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2200; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2201; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2202; GFX8-NEXT: buffer_wbinvl1_vol 2203; GFX8-NEXT: BB11_2: 2204; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2205; GFX8-NEXT: v_readfirstlane_b32 s3, v2 2206; GFX8-NEXT: v_readfirstlane_b32 s2, v1 2207; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2208; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2209; GFX8-NEXT: v_mov_b32_e32 v2, s3 2210; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2211; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2212; GFX8-NEXT: s_mov_b32 s3, 0xf000 2213; GFX8-NEXT: s_mov_b32 s2, -1 2214; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2215; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2216; GFX8-NEXT: s_endpgm 2217; 2218; GFX9-LABEL: sub_i64_constant: 2219; GFX9: ; %bb.0: ; %entry 2220; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2221; GFX9-NEXT: s_mov_b64 s[4:5], exec 2222; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2223; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2224; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2225; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2226; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2227; GFX9-NEXT: s_cbranch_execz BB11_2 2228; GFX9-NEXT: ; %bb.1: 2229; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2230; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2231; 
GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2232; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2233; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2234; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2235; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2236; GFX9-NEXT: buffer_wbinvl1_vol 2237; GFX9-NEXT: BB11_2: 2238; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2239; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2240; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2241; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2242; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2243; GFX9-NEXT: v_mov_b32_e32 v2, s3 2244; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2245; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2246; GFX9-NEXT: s_mov_b32 s3, 0xf000 2247; GFX9-NEXT: s_mov_b32 s2, -1 2248; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2249; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2250; GFX9-NEXT: s_endpgm 2251; 2252; GFX1064-LABEL: sub_i64_constant: 2253; GFX1064: ; %bb.0: ; %entry 2254; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2255; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2256; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2257; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2258; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2259; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2260; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2261; GFX1064-NEXT: s_cbranch_execz BB11_2 2262; GFX1064-NEXT: ; %bb.1: 2263; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2264; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2265; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2266; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2267; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2268; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2269; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2270; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2271; GFX1064-NEXT: buffer_gl0_inv 2272; GFX1064-NEXT: buffer_gl1_inv 2273; GFX1064-NEXT: BB11_2: 2274; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2275; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2276; GFX1064-NEXT: 
v_readfirstlane_b32 s2, v1 2277; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2278; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2279; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2280; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2281; GFX1064-NEXT: s_mov_b32 s2, -1 2282; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2283; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2284; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2285; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2286; GFX1064-NEXT: s_endpgm 2287; 2288; GFX1032-LABEL: sub_i64_constant: 2289; GFX1032: ; %bb.0: ; %entry 2290; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2291; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2292; GFX1032-NEXT: ; implicit-def: $vcc_hi 2293; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2294; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2295; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2296; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2297; GFX1032-NEXT: s_cbranch_execz BB11_2 2298; GFX1032-NEXT: ; %bb.1: 2299; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2300; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2301; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 2302; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 2303; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2304; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2305; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2306; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2307; GFX1032-NEXT: buffer_gl0_inv 2308; GFX1032-NEXT: buffer_gl1_inv 2309; GFX1032-NEXT: BB11_2: 2310; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2311; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2312; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2313; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2314; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2315; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2316; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2317; GFX1032-NEXT: s_mov_b32 s2, -1 2318; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2319; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2320; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2321; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2322; GFX1032-NEXT: s_endpgm 2323entry: 2324 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2325 store i64 %old, i64 addrspace(1)* %out 2326 ret void 2327} 2328 2329define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2330; 2331; 2332; GFX7LESS-LABEL: sub_i64_uniform: 2333; GFX7LESS: ; %bb.0: ; %entry 2334; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2335; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2336; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2337; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2338; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2339; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2340; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2341; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2342; GFX7LESS-NEXT: ; %bb.1: 2343; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2344; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2345; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2346; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2347; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2348; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2349; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2350; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2351; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2352; GFX7LESS-NEXT: s_mov_b32 m0, -1 2353; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2354; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2355; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2356; GFX7LESS-NEXT: buffer_wbinvl1 2357; GFX7LESS-NEXT: BB12_2: 2358; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2359; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2360; GFX7LESS-NEXT: s_mov_b32 s6, -1 2361; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2362; GFX7LESS-NEXT: s_mov_b32 s4, s0 2363; GFX7LESS-NEXT: s_mov_b32 s5, s1 2364; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2365; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2366; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2367; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2368; 
GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2369; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2370; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2371; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2372; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2373; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2374; GFX7LESS-NEXT: s_endpgm 2375; 2376; GFX8-LABEL: sub_i64_uniform: 2377; GFX8: ; %bb.0: ; %entry 2378; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2379; GFX8-NEXT: s_mov_b64 s[6:7], exec 2380; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2381; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2382; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2383; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2384; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2385; GFX8-NEXT: s_cbranch_execz BB12_2 2386; GFX8-NEXT: ; %bb.1: 2387; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2388; GFX8-NEXT: v_mov_b32_e32 v1, s6 2389; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2390; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2391; GFX8-NEXT: s_mul_i32 s7, s3, s6 2392; GFX8-NEXT: s_mul_i32 s6, s2, s6 2393; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2394; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2395; GFX8-NEXT: v_mov_b32_e32 v1, s6 2396; GFX8-NEXT: s_mov_b32 m0, -1 2397; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2398; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2399; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2400; GFX8-NEXT: buffer_wbinvl1_vol 2401; GFX8-NEXT: BB12_2: 2402; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2403; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2404; GFX8-NEXT: s_mov_b32 s4, s0 2405; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2406; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2407; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2408; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2409; GFX8-NEXT: s_mov_b32 s5, s1 2410; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2411; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2412; GFX8-NEXT: v_mov_b32_e32 v2, s1 2413; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2414; GFX8-NEXT: s_mov_b32 s7, 0xf000 2415; GFX8-NEXT: s_mov_b32 s6, -1 2416; GFX8-NEXT: 
v_subb_u32_e32 v1, vcc, v2, v1, vcc 2417; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2418; GFX8-NEXT: s_endpgm 2419; 2420; GFX9-LABEL: sub_i64_uniform: 2421; GFX9: ; %bb.0: ; %entry 2422; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2423; GFX9-NEXT: s_mov_b64 s[6:7], exec 2424; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2425; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2426; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2427; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2428; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2429; GFX9-NEXT: s_cbranch_execz BB12_2 2430; GFX9-NEXT: ; %bb.1: 2431; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2432; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2433; GFX9-NEXT: s_mul_i32 s7, s3, s6 2434; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2435; GFX9-NEXT: s_add_i32 s8, s8, s7 2436; GFX9-NEXT: s_mul_i32 s6, s2, s6 2437; GFX9-NEXT: v_mov_b32_e32 v1, s6 2438; GFX9-NEXT: v_mov_b32_e32 v2, s8 2439; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2440; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2441; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2442; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2443; GFX9-NEXT: buffer_wbinvl1_vol 2444; GFX9-NEXT: BB12_2: 2445; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2447; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2448; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2449; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2450; GFX9-NEXT: s_mov_b32 s4, s0 2451; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2452; GFX9-NEXT: s_mov_b32 s5, s1 2453; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2454; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2455; GFX9-NEXT: v_mov_b32_e32 v2, s1 2456; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2457; GFX9-NEXT: s_mov_b32 s7, 0xf000 2458; GFX9-NEXT: s_mov_b32 s6, -1 2459; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2460; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2461; GFX9-NEXT: s_endpgm 2462; 2463; GFX1064-LABEL: sub_i64_uniform: 2464; GFX1064: ; %bb.0: ; %entry 2465; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
2466; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2467; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2468; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2469; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2470; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2471; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2472; GFX1064-NEXT: s_cbranch_execz BB12_2 2473; GFX1064-NEXT: ; %bb.1: 2474; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2475; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2476; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2477; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2478; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2479; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2480; GFX1064-NEXT: s_add_i32 s8, s8, s7 2481; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2482; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2483; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2484; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2485; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2486; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2487; GFX1064-NEXT: buffer_gl0_inv 2488; GFX1064-NEXT: buffer_gl1_inv 2489; GFX1064-NEXT: BB12_2: 2490; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2491; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2492; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2493; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2494; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2495; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2496; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2497; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2498; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2499; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2500; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0 2501; GFX1064-NEXT: s_mov_b32 s2, -1 2502; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2503; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2504; GFX1064-NEXT: s_endpgm 2505; 2506; GFX1032-LABEL: sub_i64_uniform: 2507; GFX1032: ; %bb.0: ; %entry 2508; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2509; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2510; GFX1032-NEXT: ; implicit-def: $vcc_hi 2511; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2512; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2513; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2514; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2515; GFX1032-NEXT: s_cbranch_execz BB12_2 2516; GFX1032-NEXT: ; %bb.1: 2517; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2518; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2519; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2520; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2521; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2522; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2523; GFX1032-NEXT: s_add_i32 s7, s7, s6 2524; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2525; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2526; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2527; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2528; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2529; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2530; GFX1032-NEXT: buffer_gl0_inv 2531; GFX1032-NEXT: buffer_gl1_inv 2532; GFX1032-NEXT: BB12_2: 2533; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2534; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2535; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2536; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2537; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2538; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2539; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2540; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2541; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2542; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2543; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0 2544; GFX1032-NEXT: s_mov_b32 s2, -1 2545; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2546; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2547; GFX1032-NEXT: s_endpgm 2548entry: 2549 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2550 store i64 %old, i64 addrspace(1)* %out 2551 ret void 2552} 2553 2554; GCN-NOT: v_mbcnt_lo_u32_b32 2555; GCN-NOT: v_mbcnt_hi_u32_b32 2556; GCN-NOT: s_bcnt1_i32_b64 2557define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2558; 2559; 
2560; GFX7LESS-LABEL: sub_i64_varying: 2561; GFX7LESS: ; %bb.0: ; %entry 2562; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2563; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2564; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2565; GFX7LESS-NEXT: s_mov_b32 m0, -1 2566; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2567; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2568; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2569; GFX7LESS-NEXT: buffer_wbinvl1 2570; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2571; GFX7LESS-NEXT: s_mov_b32 s2, -1 2572; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2573; GFX7LESS-NEXT: s_endpgm 2574; 2575; GFX8-LABEL: sub_i64_varying: 2576; GFX8: ; %bb.0: ; %entry 2577; GFX8-NEXT: v_mov_b32_e32 v1, 0 2578; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2579; GFX8-NEXT: s_mov_b32 m0, -1 2580; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2581; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2582; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2583; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2584; GFX8-NEXT: buffer_wbinvl1_vol 2585; GFX8-NEXT: s_mov_b32 s3, 0xf000 2586; GFX8-NEXT: s_mov_b32 s2, -1 2587; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2588; GFX8-NEXT: s_endpgm 2589; 2590; GFX9-LABEL: sub_i64_varying: 2591; GFX9: ; %bb.0: ; %entry 2592; GFX9-NEXT: v_mov_b32_e32 v1, 0 2593; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2594; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2595; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2596; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2597; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2598; GFX9-NEXT: buffer_wbinvl1_vol 2599; GFX9-NEXT: s_mov_b32 s3, 0xf000 2600; GFX9-NEXT: s_mov_b32 s2, -1 2601; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2602; GFX9-NEXT: s_endpgm 2603; 2604; GFX1064-LABEL: sub_i64_varying: 2605; GFX1064: ; %bb.0: ; %entry 2606; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2607; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2608; GFX1064-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 2609; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2610; GFX1064-NEXT: s_mov_b32 s2, -1 2611; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2612; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2613; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2614; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2615; GFX1064-NEXT: buffer_gl0_inv 2616; GFX1064-NEXT: buffer_gl1_inv 2617; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2618; GFX1064-NEXT: s_endpgm 2619; 2620; GFX1032-LABEL: sub_i64_varying: 2621; GFX1032: ; %bb.0: ; %entry 2622; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2623; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2624; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2625; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2626; GFX1032-NEXT: s_mov_b32 s2, -1 2627; GFX1032-NEXT: ; implicit-def: $vcc_hi 2628; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2629; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2630; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2631; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2632; GFX1032-NEXT: buffer_gl0_inv 2633; GFX1032-NEXT: buffer_gl1_inv 2634; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2635; GFX1032-NEXT: s_endpgm 2636entry: 2637 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2638 %zext = zext i32 %lane to i64 2639 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2640 store i64 %old, i64 addrspace(1)* %out 2641 ret void 2642} 2643 2644define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2645; 2646; 2647; GFX7LESS-LABEL: and_i32_varying: 2648; GFX7LESS: ; %bb.0: ; %entry 2649; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2650; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2651; GFX7LESS-NEXT: s_mov_b32 m0, -1 2652; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2653; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2654; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2655; GFX7LESS-NEXT: buffer_wbinvl1 2656; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2657; GFX7LESS-NEXT: s_mov_b32 
s2, -1 2658; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2659; GFX7LESS-NEXT: s_endpgm 2660; 2661; GFX8-LABEL: and_i32_varying: 2662; GFX8: ; %bb.0: ; %entry 2663; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2664; GFX8-NEXT: v_mov_b32_e32 v2, v0 2665; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2666; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2667; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2668; GFX8-NEXT: v_mov_b32_e32 v1, -1 2669; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2670; GFX8-NEXT: s_not_b64 exec, exec 2671; GFX8-NEXT: v_mov_b32_e32 v2, -1 2672; GFX8-NEXT: s_not_b64 exec, exec 2673; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2674; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2675; GFX8-NEXT: s_nop 1 2676; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2677; GFX8-NEXT: s_nop 1 2678; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2679; GFX8-NEXT: s_nop 1 2680; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2681; GFX8-NEXT: s_nop 1 2682; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2683; GFX8-NEXT: s_nop 1 2684; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2685; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2686; GFX8-NEXT: s_nop 0 2687; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2688; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2689; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2690; GFX8-NEXT: ; implicit-def: $vgpr0 2691; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2692; GFX8-NEXT: s_cbranch_execz BB14_2 2693; GFX8-NEXT: ; %bb.1: 2694; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2695; GFX8-NEXT: v_mov_b32_e32 v3, s4 2696; GFX8-NEXT: s_mov_b32 m0, -1 2697; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2698; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2699; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2700; GFX8-NEXT: buffer_wbinvl1_vol 2701; GFX8-NEXT: BB14_2: 2702; GFX8-NEXT: s_or_b64 exec, exec, 
s[2:3] 2703; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2704; GFX8-NEXT: v_mov_b32_e32 v0, v1 2705; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2706; GFX8-NEXT: s_mov_b32 s3, 0xf000 2707; GFX8-NEXT: s_mov_b32 s2, -1 2708; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2709; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2710; GFX8-NEXT: s_endpgm 2711; 2712; GFX9-LABEL: and_i32_varying: 2713; GFX9: ; %bb.0: ; %entry 2714; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2715; GFX9-NEXT: v_mov_b32_e32 v2, v0 2716; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2717; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2718; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2719; GFX9-NEXT: v_mov_b32_e32 v1, -1 2720; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2721; GFX9-NEXT: s_not_b64 exec, exec 2722; GFX9-NEXT: v_mov_b32_e32 v2, -1 2723; GFX9-NEXT: s_not_b64 exec, exec 2724; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2725; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2726; GFX9-NEXT: s_nop 1 2727; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2728; GFX9-NEXT: s_nop 1 2729; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2730; GFX9-NEXT: s_nop 1 2731; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2732; GFX9-NEXT: s_nop 1 2733; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2734; GFX9-NEXT: s_nop 1 2735; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2736; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2737; GFX9-NEXT: s_nop 0 2738; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2739; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2740; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2741; GFX9-NEXT: ; implicit-def: $vgpr0 2742; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2743; GFX9-NEXT: s_cbranch_execz BB14_2 2744; GFX9-NEXT: ; %bb.1: 2745; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2746; GFX9-NEXT: v_mov_b32_e32 v3, s4 2747; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
2748; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2749; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2750; GFX9-NEXT: buffer_wbinvl1_vol 2751; GFX9-NEXT: BB14_2: 2752; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2753; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2754; GFX9-NEXT: v_mov_b32_e32 v0, v1 2755; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2756; GFX9-NEXT: s_mov_b32 s3, 0xf000 2757; GFX9-NEXT: s_mov_b32 s2, -1 2758; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2759; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2760; GFX9-NEXT: s_endpgm 2761; 2762; GFX1064-LABEL: and_i32_varying: 2763; GFX1064: ; %bb.0: ; %entry 2764; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2765; GFX1064-NEXT: s_not_b64 exec, exec 2766; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2767; GFX1064-NEXT: s_not_b64 exec, exec 2768; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2769; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2770; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2771; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2772; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2773; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2774; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2775; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2776; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2777; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2778; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2779; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2780; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2781; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2782; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2783; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2784; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2785; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2786; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2787; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2788; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 
0 2789; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2790; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2791; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2792; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2793; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2794; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2795; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2796; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2797; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2798; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2799; GFX1064-NEXT: s_mov_b32 s2, -1 2800; GFX1064-NEXT: ; implicit-def: $vgpr0 2801; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2802; GFX1064-NEXT: s_cbranch_execz BB14_2 2803; GFX1064-NEXT: ; %bb.1: 2804; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2805; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2806; GFX1064-NEXT: s_mov_b32 s3, s7 2807; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2808; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2809; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 2810; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2811; GFX1064-NEXT: buffer_gl0_inv 2812; GFX1064-NEXT: buffer_gl1_inv 2813; GFX1064-NEXT: BB14_2: 2814; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2815; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2816; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2817; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2818; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2819; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2820; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2821; GFX1064-NEXT: s_nop 0 2822; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2823; GFX1064-NEXT: s_endpgm 2824; 2825; GFX1032-LABEL: and_i32_varying: 2826; GFX1032: ; %bb.0: ; %entry 2827; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2828; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2829; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2830; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2831; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2832; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2833; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
2834; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2835; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2836; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2837; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2838; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2839; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2840; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2841; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2842; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2843; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2844; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2845; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2846; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2847; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2848; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2849; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2850; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2851; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2852; GFX1032-NEXT: s_mov_b32 s2, -1 2853; GFX1032-NEXT: ; implicit-def: $vgpr0 2854; GFX1032-NEXT: ; implicit-def: $vcc_hi 2855; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2856; GFX1032-NEXT: s_cbranch_execz BB14_2 2857; GFX1032-NEXT: ; %bb.1: 2858; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2859; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2860; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2861; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2862; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 2863; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2864; GFX1032-NEXT: buffer_gl0_inv 2865; GFX1032-NEXT: buffer_gl1_inv 2866; GFX1032-NEXT: BB14_2: 2867; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2868; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2869; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2870; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2871; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2872; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2873; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2874; GFX1032-NEXT: s_nop 0 2875; GFX1032-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 2876; GFX1032-NEXT: s_endpgm 2877entry: 2878 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2879 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2880 store i32 %old, i32 addrspace(1)* %out 2881 ret void 2882} 2883 2884define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2885; 2886; 2887; GFX7LESS-LABEL: or_i32_varying: 2888; GFX7LESS: ; %bb.0: ; %entry 2889; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2890; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2891; GFX7LESS-NEXT: s_mov_b32 m0, -1 2892; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2893; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2894; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2895; GFX7LESS-NEXT: buffer_wbinvl1 2896; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2897; GFX7LESS-NEXT: s_mov_b32 s2, -1 2898; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2899; GFX7LESS-NEXT: s_endpgm 2900; 2901; GFX8-LABEL: or_i32_varying: 2902; GFX8: ; %bb.0: ; %entry 2903; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2904; GFX8-NEXT: v_mov_b32_e32 v2, v0 2905; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2906; GFX8-NEXT: v_mov_b32_e32 v1, 0 2907; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2908; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2909; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2910; GFX8-NEXT: s_not_b64 exec, exec 2911; GFX8-NEXT: v_mov_b32_e32 v2, 0 2912; GFX8-NEXT: s_not_b64 exec, exec 2913; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2914; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2915; GFX8-NEXT: s_nop 1 2916; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2917; GFX8-NEXT: s_nop 1 2918; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2919; GFX8-NEXT: s_nop 1 2920; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2921; GFX8-NEXT: s_nop 1 2922; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 
row_bcast:15 row_mask:0xa bank_mask:0xf 2923; GFX8-NEXT: s_nop 1 2924; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2925; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2926; GFX8-NEXT: s_nop 0 2927; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2928; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2929; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2930; GFX8-NEXT: ; implicit-def: $vgpr0 2931; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2932; GFX8-NEXT: s_cbranch_execz BB15_2 2933; GFX8-NEXT: ; %bb.1: 2934; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2935; GFX8-NEXT: v_mov_b32_e32 v3, s4 2936; GFX8-NEXT: s_mov_b32 m0, -1 2937; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2938; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2939; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2940; GFX8-NEXT: buffer_wbinvl1_vol 2941; GFX8-NEXT: BB15_2: 2942; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2943; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2944; GFX8-NEXT: v_mov_b32_e32 v0, v1 2945; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2946; GFX8-NEXT: s_mov_b32 s3, 0xf000 2947; GFX8-NEXT: s_mov_b32 s2, -1 2948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2949; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2950; GFX8-NEXT: s_endpgm 2951; 2952; GFX9-LABEL: or_i32_varying: 2953; GFX9: ; %bb.0: ; %entry 2954; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2955; GFX9-NEXT: v_mov_b32_e32 v2, v0 2956; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2957; GFX9-NEXT: v_mov_b32_e32 v1, 0 2958; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2959; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2960; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2961; GFX9-NEXT: s_not_b64 exec, exec 2962; GFX9-NEXT: v_mov_b32_e32 v2, 0 2963; GFX9-NEXT: s_not_b64 exec, exec 2964; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2965; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2966; GFX9-NEXT: s_nop 1 2967; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2968; GFX9-NEXT: s_nop 1 2969; 
GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2970; GFX9-NEXT: s_nop 1 2971; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2972; GFX9-NEXT: s_nop 1 2973; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2974; GFX9-NEXT: s_nop 1 2975; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2976; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2977; GFX9-NEXT: s_nop 0 2978; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2979; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2980; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2981; GFX9-NEXT: ; implicit-def: $vgpr0 2982; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2983; GFX9-NEXT: s_cbranch_execz BB15_2 2984; GFX9-NEXT: ; %bb.1: 2985; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2986; GFX9-NEXT: v_mov_b32_e32 v3, s4 2987; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2988; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2989; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2990; GFX9-NEXT: buffer_wbinvl1_vol 2991; GFX9-NEXT: BB15_2: 2992; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2993; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2994; GFX9-NEXT: v_mov_b32_e32 v0, v1 2995; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2996; GFX9-NEXT: s_mov_b32 s3, 0xf000 2997; GFX9-NEXT: s_mov_b32 s2, -1 2998; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2999; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3000; GFX9-NEXT: s_endpgm 3001; 3002; GFX1064-LABEL: or_i32_varying: 3003; GFX1064: ; %bb.0: ; %entry 3004; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3005; GFX1064-NEXT: s_not_b64 exec, exec 3006; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3007; GFX1064-NEXT: s_not_b64 exec, exec 3008; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3009; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3010; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3011; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3012; GFX1064-NEXT: v_or_b32_dpp v1, 
v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3013; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3014; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3015; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3016; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3017; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3018; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3019; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3020; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3021; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3022; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3023; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3024; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3025; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3026; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3027; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3028; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3029; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3030; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3031; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3032; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3033; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3034; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3035; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3036; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3037; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3038; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3039; GFX1064-NEXT: s_mov_b32 s2, -1 3040; GFX1064-NEXT: ; implicit-def: $vgpr0 3041; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3042; GFX1064-NEXT: s_cbranch_execz BB15_2 3043; GFX1064-NEXT: ; %bb.1: 3044; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3045; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3046; GFX1064-NEXT: s_mov_b32 s3, s7 3047; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3048; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3049; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 3050; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3051; 
GFX1064-NEXT: buffer_gl0_inv 3052; GFX1064-NEXT: buffer_gl1_inv 3053; GFX1064-NEXT: BB15_2: 3054; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3055; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3056; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3057; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3058; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3059; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3060; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3061; GFX1064-NEXT: s_nop 0 3062; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3063; GFX1064-NEXT: s_endpgm 3064; 3065; GFX1032-LABEL: or_i32_varying: 3066; GFX1032: ; %bb.0: ; %entry 3067; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3068; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3069; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3070; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3071; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3072; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3073; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3074; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3075; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3076; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3077; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3078; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3079; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3080; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3081; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3082; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3083; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3084; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3085; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3086; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3087; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3088; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3089; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3090; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3091; GFX1032-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB15_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB15_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; xor_i32_varying - acq_rel `atomicrmw xor` on the LDS global @local_var32 with
; the work-item id as operand (a different value in every lane); the value
; returned by the atomic is stored to %out.
;
; What the autogenerated checks below pin down, per RUN configuration:
;  * GFX7LESS run - no cross-lane code at all; each lane issues
;    ds_xor_rtn_b32 with its own value.
;  * GFX8 and GFX9 runs - inactive lanes are set to 0, a v_xor_b32_dpp
;    sequence (row_shr:1/2/4/8, row_bcast:15/31) combines the lanes, lane 63
;    is read with v_readlane_b32, and only lanes whose mbcnt result is 0
;    execute the single ds_xor_rtn_b32. The returned value is then broadcast
;    with v_readfirstlane_b32 and xor'ed with the wave_shr:1-shifted per-lane
;    value.
;  * GFX1064 and GFX1032 runs - same shape, but the cross-row steps use
;    v_permlanex16_b32, v_readlane_b32 and v_writelane_b32.
; Do not hand-edit the check lines; rerun utils/update_llc_test_checks.py.
define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: xor_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: xor_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB16_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB16_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB16_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB16_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: xor_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB16_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB16_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: xor_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB16_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB16_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; max_i32_varying - acq_rel signed `atomicrmw max` on the LDS global
; @local_var32 with the work-item id as operand (a different value in every
; lane); the value returned by the atomic is stored to %out.
;
; What the autogenerated checks below pin down, per RUN configuration:
;  * GFX7LESS run - no cross-lane code; each lane issues ds_max_rtn_i32 with
;    its own value.
;  * GFX8 and GFX9 runs - inactive lanes receive the result of
;    v_bfrev_b32 1 (bit-reversed 1), a v_max_i32_dpp sequence
;    (row_shr:1/2/4/8, row_bcast:15/31, no bound_ctrl:0 here) combines the
;    lanes, lane 63 is read with v_readlane_b32, and only lanes whose mbcnt
;    result is 0 execute the single ds_max_rtn_i32. The returned value is
;    broadcast with v_readfirstlane_b32 and combined per lane with v_max_i32.
;  * GFX1064 and GFX1032 runs - same shape, but the cross-row steps use
;    v_permlanex16_b32, v_readlane_b32 and v_writelane_b32.
; Do not hand-edit the check lines; rerun utils/update_llc_test_checks.py.
define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: max_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: max_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB17_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB17_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_max_i32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB17_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB17_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_max_i32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s4
; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v2, 31
; GFX1064-NEXT: v_writelane_b32 v1, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v2, 63
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB17_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB17_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 15
; GFX1032-NEXT: v_readlane_b32 s4, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB17_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB17_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; max_i64_constant - acq_rel signed `atomicrmw max` of the constant 5 on the
; 64-bit LDS global @local_var64; the returned value is stored to %out.
;
; Every RUN configuration (GFX7LESS included) is checked for the same shape:
; the mbcnt result is compared with 0, the lanes that pass execute one
; ds_max_rtn_i64 with operand 5, both halves of the result are broadcast with
; v_readfirstlane_b32, and each lane then picks - via v_cndmask on that same
; compare - between 5 and the pair {lo = 0, hi = bfrev(1), spelled 0x80000000
; in the GFX10 checks} before the final v_cmp_gt_i64 + v_cndmask select.
; The GFX1032 run uses only v_mbcnt_lo (no v_mbcnt_hi).
; Do not hand-edit the check lines; rerun utils/update_llc_test_checks.py.
define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: max_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB18_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: BB18_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4
; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: max_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB18_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB18_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB18_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB18_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB18_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB18_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: min_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT:
buffer_store_dword v0, off, s[0:3], 0 3808; GFX7LESS-NEXT: s_endpgm 3809; 3810; GFX8-LABEL: min_i32_varying: 3811; GFX8: ; %bb.0: ; %entry 3812; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3813; GFX8-NEXT: v_mov_b32_e32 v2, v0 3814; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3815; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3816; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3817; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3818; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3819; GFX8-NEXT: s_not_b64 exec, exec 3820; GFX8-NEXT: v_mov_b32_e32 v2, v1 3821; GFX8-NEXT: s_not_b64 exec, exec 3822; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3823; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3824; GFX8-NEXT: s_nop 1 3825; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3826; GFX8-NEXT: s_nop 1 3827; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3828; GFX8-NEXT: s_nop 1 3829; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3830; GFX8-NEXT: s_nop 1 3831; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3832; GFX8-NEXT: s_nop 1 3833; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3834; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3835; GFX8-NEXT: s_nop 0 3836; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3837; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3838; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3839; GFX8-NEXT: ; implicit-def: $vgpr0 3840; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3841; GFX8-NEXT: s_cbranch_execz BB19_2 3842; GFX8-NEXT: ; %bb.1: 3843; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3844; GFX8-NEXT: v_mov_b32_e32 v3, s4 3845; GFX8-NEXT: s_mov_b32 m0, -1 3846; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3847; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3848; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3849; GFX8-NEXT: buffer_wbinvl1_vol 3850; GFX8-NEXT: BB19_2: 3851; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3852; GFX8-NEXT: 
v_readfirstlane_b32 s2, v0 3853; GFX8-NEXT: v_mov_b32_e32 v0, v1 3854; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3855; GFX8-NEXT: s_mov_b32 s3, 0xf000 3856; GFX8-NEXT: s_mov_b32 s2, -1 3857; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3858; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3859; GFX8-NEXT: s_endpgm 3860; 3861; GFX9-LABEL: min_i32_varying: 3862; GFX9: ; %bb.0: ; %entry 3863; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3864; GFX9-NEXT: v_mov_b32_e32 v2, v0 3865; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3866; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3867; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3868; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3869; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3870; GFX9-NEXT: s_not_b64 exec, exec 3871; GFX9-NEXT: v_mov_b32_e32 v2, v1 3872; GFX9-NEXT: s_not_b64 exec, exec 3873; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3874; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3875; GFX9-NEXT: s_nop 1 3876; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3877; GFX9-NEXT: s_nop 1 3878; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3879; GFX9-NEXT: s_nop 1 3880; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3881; GFX9-NEXT: s_nop 1 3882; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3883; GFX9-NEXT: s_nop 1 3884; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3885; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3886; GFX9-NEXT: s_nop 0 3887; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3888; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3889; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3890; GFX9-NEXT: ; implicit-def: $vgpr0 3891; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3892; GFX9-NEXT: s_cbranch_execz BB19_2 3893; GFX9-NEXT: ; %bb.1: 3894; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3895; GFX9-NEXT: v_mov_b32_e32 v3, s4 3896; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3897; GFX9-NEXT: 
ds_min_rtn_i32 v0, v0, v3 3898; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3899; GFX9-NEXT: buffer_wbinvl1_vol 3900; GFX9-NEXT: BB19_2: 3901; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3902; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3903; GFX9-NEXT: v_mov_b32_e32 v0, v1 3904; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3905; GFX9-NEXT: s_mov_b32 s3, 0xf000 3906; GFX9-NEXT: s_mov_b32 s2, -1 3907; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3908; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3909; GFX9-NEXT: s_endpgm 3910; 3911; GFX1064-LABEL: min_i32_varying: 3912; GFX1064: ; %bb.0: ; %entry 3913; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3914; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3915; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3916; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3917; GFX1064-NEXT: s_not_b64 exec, exec 3918; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3919; GFX1064-NEXT: s_not_b64 exec, exec 3920; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3921; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3922; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3923; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3924; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3925; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3926; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3927; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3928; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3929; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3930; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3931; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3932; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3933; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3934; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3935; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3936; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3937; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3938; GFX1064-NEXT: 
s_mov_b64 exec, s[2:3] 3939; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3940; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3941; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3942; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3943; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3944; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3945; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3946; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3947; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3948; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3949; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3950; GFX1064-NEXT: s_mov_b32 s2, -1 3951; GFX1064-NEXT: ; implicit-def: $vgpr0 3952; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3953; GFX1064-NEXT: s_cbranch_execz BB19_2 3954; GFX1064-NEXT: ; %bb.1: 3955; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3956; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3957; GFX1064-NEXT: s_mov_b32 s3, s7 3958; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3959; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3960; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 3961; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3962; GFX1064-NEXT: buffer_gl0_inv 3963; GFX1064-NEXT: buffer_gl1_inv 3964; GFX1064-NEXT: BB19_2: 3965; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3966; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3967; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3968; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3969; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3970; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3971; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3972; GFX1064-NEXT: s_nop 0 3973; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3974; GFX1064-NEXT: s_endpgm 3975; 3976; GFX1032-LABEL: min_i32_varying: 3977; GFX1032: ; %bb.0: ; %entry 3978; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3979; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3980; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3981; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3982; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3983; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3984; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 
3985; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3986; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3987; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3988; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3989; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3990; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3991; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3992; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3993; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3994; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3995; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3996; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3997; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3998; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3999; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4000; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4001; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4002; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 4003; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4004; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4005; GFX1032-NEXT: s_mov_b32 s2, -1 4006; GFX1032-NEXT: ; implicit-def: $vgpr0 4007; GFX1032-NEXT: ; implicit-def: $vcc_hi 4008; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4009; GFX1032-NEXT: s_cbranch_execz BB19_2 4010; GFX1032-NEXT: ; %bb.1: 4011; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4012; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4013; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4014; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4015; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 4016; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4017; GFX1032-NEXT: buffer_gl0_inv 4018; GFX1032-NEXT: buffer_gl1_inv 4019; GFX1032-NEXT: BB19_2: 4020; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4021; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4022; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4023; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4024; GFX1032-NEXT: 
v_min_i32_e32 v0, s3, v0 4025; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4026; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4027; GFX1032-NEXT: s_nop 0 4028; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4029; GFX1032-NEXT: s_endpgm 4030entry: 4031 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4032 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4033 store i32 %old, i32 addrspace(1)* %out 4034 ret void 4035} 4036 4037define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 4038; 4039; 4040; GFX7LESS-LABEL: min_i64_constant: 4041; GFX7LESS: ; %bb.0: ; %entry 4042; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4043; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4044; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4045; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4046; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4047; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4048; GFX7LESS-NEXT: s_cbranch_execz BB20_2 4049; GFX7LESS-NEXT: ; %bb.1: 4050; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4051; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4052; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4053; GFX7LESS-NEXT: s_mov_b32 m0, -1 4054; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4055; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4056; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4057; GFX7LESS-NEXT: buffer_wbinvl1 4058; GFX7LESS-NEXT: BB20_2: 4059; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4060; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4061; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4062; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 4063; GFX7LESS-NEXT: s_mov_b32 s2, -1 4064; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4065; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4066; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4067; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4068; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4069; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4070; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4071; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4072; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4073; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4074; GFX7LESS-NEXT: s_endpgm 4075; 4076; GFX8-LABEL: min_i64_constant: 4077; GFX8: ; %bb.0: ; %entry 4078; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4079; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4080; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4081; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4082; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4083; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4084; GFX8-NEXT: s_cbranch_execz BB20_2 4085; GFX8-NEXT: ; %bb.1: 4086; GFX8-NEXT: v_mov_b32_e32 v0, 5 4087; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4088; GFX8-NEXT: v_mov_b32_e32 v1, 0 4089; GFX8-NEXT: s_mov_b32 m0, -1 4090; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4091; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4092; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4093; GFX8-NEXT: buffer_wbinvl1_vol 4094; GFX8-NEXT: BB20_2: 4095; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4096; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4097; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 4098; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4099; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4100; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4101; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4102; GFX8-NEXT: v_mov_b32_e32 v2, s5 4103; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4104; GFX8-NEXT: v_mov_b32_e32 v2, s4 4105; GFX8-NEXT: s_mov_b32 s2, -1 4106; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4107; GFX8-NEXT: s_mov_b32 s3, 0xf000 4108; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4109; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4110; GFX8-NEXT: s_endpgm 4111; 4112; GFX9-LABEL: min_i64_constant: 4113; GFX9: ; %bb.0: ; %entry 4114; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4115; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4116; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4117; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4118; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 
4119; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4120; GFX9-NEXT: s_cbranch_execz BB20_2 4121; GFX9-NEXT: ; %bb.1: 4122; GFX9-NEXT: v_mov_b32_e32 v0, 5 4123; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4124; GFX9-NEXT: v_mov_b32_e32 v1, 0 4125; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4126; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4127; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4128; GFX9-NEXT: buffer_wbinvl1_vol 4129; GFX9-NEXT: BB20_2: 4130; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4131; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4132; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4133; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4134; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4135; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4136; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4137; GFX9-NEXT: v_mov_b32_e32 v2, s5 4138; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4139; GFX9-NEXT: v_mov_b32_e32 v2, s4 4140; GFX9-NEXT: s_mov_b32 s2, -1 4141; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4142; GFX9-NEXT: s_mov_b32 s3, 0xf000 4143; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4144; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4145; GFX9-NEXT: s_endpgm 4146; 4147; GFX1064-LABEL: min_i64_constant: 4148; GFX1064: ; %bb.0: ; %entry 4149; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4150; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4151; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4152; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4153; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4154; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4155; GFX1064-NEXT: s_cbranch_execz BB20_2 4156; GFX1064-NEXT: ; %bb.1: 4157; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4158; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4159; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4160; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4161; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4162; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4163; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4164; GFX1064-NEXT: buffer_gl0_inv 
4165; GFX1064-NEXT: buffer_gl1_inv 4166; GFX1064-NEXT: BB20_2: 4167; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4168; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4169; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4170; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4171; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4172; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4173; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 4174; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4175; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4176; GFX1064-NEXT: s_mov_b32 s2, -1 4177; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4178; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4179; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4180; GFX1064-NEXT: s_endpgm 4181; 4182; GFX1032-LABEL: min_i64_constant: 4183; GFX1032: ; %bb.0: ; %entry 4184; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4185; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4186; GFX1032-NEXT: ; implicit-def: $vcc_hi 4187; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4188; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4189; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4190; GFX1032-NEXT: s_cbranch_execz BB20_2 4191; GFX1032-NEXT: ; %bb.1: 4192; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4193; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4194; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4195; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4196; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4197; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4198; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4199; GFX1032-NEXT: buffer_gl0_inv 4200; GFX1032-NEXT: buffer_gl1_inv 4201; GFX1032-NEXT: BB20_2: 4202; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4203; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4204; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4205; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4206; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4207; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4208; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], 
v[0:1] 4209; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4210; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4211; GFX1032-NEXT: s_mov_b32 s2, -1 4212; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4213; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4214; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4215; GFX1032-NEXT: s_endpgm 4216entry: 4217 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 4218 store i64 %old, i64 addrspace(1)* %out 4219 ret void 4220} 4221 4222define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4223; 4224; 4225; GFX7LESS-LABEL: umax_i32_varying: 4226; GFX7LESS: ; %bb.0: ; %entry 4227; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4228; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4229; GFX7LESS-NEXT: s_mov_b32 m0, -1 4230; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4231; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4232; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4233; GFX7LESS-NEXT: buffer_wbinvl1 4234; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4235; GFX7LESS-NEXT: s_mov_b32 s2, -1 4236; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4237; GFX7LESS-NEXT: s_endpgm 4238; 4239; GFX8-LABEL: umax_i32_varying: 4240; GFX8: ; %bb.0: ; %entry 4241; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4242; GFX8-NEXT: v_mov_b32_e32 v2, v0 4243; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4244; GFX8-NEXT: v_mov_b32_e32 v1, 0 4245; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4246; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4247; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4248; GFX8-NEXT: s_not_b64 exec, exec 4249; GFX8-NEXT: v_mov_b32_e32 v2, 0 4250; GFX8-NEXT: s_not_b64 exec, exec 4251; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4252; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4253; GFX8-NEXT: s_nop 1 4254; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4255; GFX8-NEXT: s_nop 1 4256; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:0 4257; GFX8-NEXT: s_nop 1 4258; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4259; GFX8-NEXT: s_nop 1 4260; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4261; GFX8-NEXT: s_nop 1 4262; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4263; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4264; GFX8-NEXT: s_nop 0 4265; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4266; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4267; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4268; GFX8-NEXT: ; implicit-def: $vgpr0 4269; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4270; GFX8-NEXT: s_cbranch_execz BB21_2 4271; GFX8-NEXT: ; %bb.1: 4272; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4273; GFX8-NEXT: v_mov_b32_e32 v3, s4 4274; GFX8-NEXT: s_mov_b32 m0, -1 4275; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4276; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4277; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4278; GFX8-NEXT: buffer_wbinvl1_vol 4279; GFX8-NEXT: BB21_2: 4280; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4281; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4282; GFX8-NEXT: v_mov_b32_e32 v0, v1 4283; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4284; GFX8-NEXT: s_mov_b32 s3, 0xf000 4285; GFX8-NEXT: s_mov_b32 s2, -1 4286; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4287; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4288; GFX8-NEXT: s_endpgm 4289; 4290; GFX9-LABEL: umax_i32_varying: 4291; GFX9: ; %bb.0: ; %entry 4292; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4293; GFX9-NEXT: v_mov_b32_e32 v2, v0 4294; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4295; GFX9-NEXT: v_mov_b32_e32 v1, 0 4296; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4297; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4298; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4299; GFX9-NEXT: s_not_b64 exec, exec 4300; GFX9-NEXT: v_mov_b32_e32 v2, 0 4301; GFX9-NEXT: s_not_b64 exec, exec 4302; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4303; 
GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4304; GFX9-NEXT: s_nop 1 4305; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4306; GFX9-NEXT: s_nop 1 4307; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4308; GFX9-NEXT: s_nop 1 4309; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4310; GFX9-NEXT: s_nop 1 4311; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4312; GFX9-NEXT: s_nop 1 4313; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4314; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4315; GFX9-NEXT: s_nop 0 4316; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4317; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4318; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4319; GFX9-NEXT: ; implicit-def: $vgpr0 4320; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4321; GFX9-NEXT: s_cbranch_execz BB21_2 4322; GFX9-NEXT: ; %bb.1: 4323; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4324; GFX9-NEXT: v_mov_b32_e32 v3, s4 4325; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4326; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4327; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4328; GFX9-NEXT: buffer_wbinvl1_vol 4329; GFX9-NEXT: BB21_2: 4330; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4331; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4332; GFX9-NEXT: v_mov_b32_e32 v0, v1 4333; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4334; GFX9-NEXT: s_mov_b32 s3, 0xf000 4335; GFX9-NEXT: s_mov_b32 s2, -1 4336; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4337; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4338; GFX9-NEXT: s_endpgm 4339; 4340; GFX1064-LABEL: umax_i32_varying: 4341; GFX1064: ; %bb.0: ; %entry 4342; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4343; GFX1064-NEXT: s_not_b64 exec, exec 4344; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4345; GFX1064-NEXT: s_not_b64 exec, exec 4346; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4347; GFX1064-NEXT: 
v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4348; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4349; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4350; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4351; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4352; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4353; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4354; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4355; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4356; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4357; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4358; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4359; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4360; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4361; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4362; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4363; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4364; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4365; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4366; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4367; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4368; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4369; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4370; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4371; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4372; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4373; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4374; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4375; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4376; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4377; GFX1064-NEXT: s_mov_b32 s2, -1 4378; GFX1064-NEXT: ; implicit-def: $vgpr0 4379; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4380; GFX1064-NEXT: s_cbranch_execz BB21_2 4381; GFX1064-NEXT: ; %bb.1: 4382; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4383; GFX1064-NEXT: 
v_mov_b32_e32 v4, s7 4384; GFX1064-NEXT: s_mov_b32 s3, s7 4385; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4386; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4387; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 4388; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4389; GFX1064-NEXT: buffer_gl0_inv 4390; GFX1064-NEXT: buffer_gl1_inv 4391; GFX1064-NEXT: BB21_2: 4392; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4393; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4394; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4395; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4396; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4397; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4398; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4399; GFX1064-NEXT: s_nop 0 4400; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4401; GFX1064-NEXT: s_endpgm 4402; 4403; GFX1032-LABEL: umax_i32_varying: 4404; GFX1032: ; %bb.0: ; %entry 4405; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4406; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4407; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4408; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4409; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4410; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4411; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4412; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4413; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4414; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4415; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4416; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4417; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4418; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4419; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4420; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4421; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4422; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4423; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4424; 
GFX1032-NEXT: s_mov_b32 exec_lo, s2 4425; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4426; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4427; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4428; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4429; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4430; GFX1032-NEXT: s_mov_b32 s2, -1 4431; GFX1032-NEXT: ; implicit-def: $vgpr0 4432; GFX1032-NEXT: ; implicit-def: $vcc_hi 4433; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4434; GFX1032-NEXT: s_cbranch_execz BB21_2 4435; GFX1032-NEXT: ; %bb.1: 4436; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4437; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4438; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4439; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4440; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 4441; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4442; GFX1032-NEXT: buffer_gl0_inv 4443; GFX1032-NEXT: buffer_gl1_inv 4444; GFX1032-NEXT: BB21_2: 4445; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4446; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4447; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4448; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4449; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4450; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4451; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4452; GFX1032-NEXT: s_nop 0 4453; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4454; GFX1032-NEXT: s_endpgm 4455entry: 4456 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4457 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4458 store i32 %old, i32 addrspace(1)* %out 4459 ret void 4460} 4461 4462define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4463; 4464; 4465; GFX7LESS-LABEL: umax_i64_constant: 4466; GFX7LESS: ; %bb.0: ; %entry 4467; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4468; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4469; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4470; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4471; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4472; 
GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4473; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4474; GFX7LESS-NEXT: ; %bb.1: 4475; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4476; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4477; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4478; GFX7LESS-NEXT: s_mov_b32 m0, -1 4479; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4480; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4481; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4482; GFX7LESS-NEXT: buffer_wbinvl1 4483; GFX7LESS-NEXT: BB22_2: 4484; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4485; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4486; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4487; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4488; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4489; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4490; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4491; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4492; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4493; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4494; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4495; GFX7LESS-NEXT: s_mov_b32 s2, -1 4496; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4497; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4498; GFX7LESS-NEXT: s_endpgm 4499; 4500; GFX8-LABEL: umax_i64_constant: 4501; GFX8: ; %bb.0: ; %entry 4502; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4503; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4504; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4505; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4506; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4507; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4508; GFX8-NEXT: s_cbranch_execz BB22_2 4509; GFX8-NEXT: ; %bb.1: 4510; GFX8-NEXT: v_mov_b32_e32 v0, 5 4511; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4512; GFX8-NEXT: v_mov_b32_e32 v1, 0 4513; GFX8-NEXT: s_mov_b32 m0, -1 4514; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4515; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4516; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4517; GFX8-NEXT: 
buffer_wbinvl1_vol 4518; GFX8-NEXT: BB22_2: 4519; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4520; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4521; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4522; GFX8-NEXT: v_mov_b32_e32 v1, 0 4523; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4524; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4525; GFX8-NEXT: v_mov_b32_e32 v1, s3 4526; GFX8-NEXT: v_mov_b32_e32 v2, s2 4527; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4528; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4529; GFX8-NEXT: s_mov_b32 s3, 0xf000 4530; GFX8-NEXT: s_mov_b32 s2, -1 4531; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4532; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4533; GFX8-NEXT: s_endpgm 4534; 4535; GFX9-LABEL: umax_i64_constant: 4536; GFX9: ; %bb.0: ; %entry 4537; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4538; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4539; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4540; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4541; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4542; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4543; GFX9-NEXT: s_cbranch_execz BB22_2 4544; GFX9-NEXT: ; %bb.1: 4545; GFX9-NEXT: v_mov_b32_e32 v0, 5 4546; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4547; GFX9-NEXT: v_mov_b32_e32 v1, 0 4548; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4549; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4550; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4551; GFX9-NEXT: buffer_wbinvl1_vol 4552; GFX9-NEXT: BB22_2: 4553; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4554; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4555; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4556; GFX9-NEXT: v_mov_b32_e32 v1, 0 4557; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4558; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4559; GFX9-NEXT: v_mov_b32_e32 v1, s3 4560; GFX9-NEXT: v_mov_b32_e32 v2, s2 4561; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4562; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4563; GFX9-NEXT: s_mov_b32 s3, 0xf000 4564; GFX9-NEXT: s_mov_b32 s2, -1 4565; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 4566; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4567; GFX9-NEXT: s_endpgm 4568; 4569; GFX1064-LABEL: umax_i64_constant: 4570; GFX1064: ; %bb.0: ; %entry 4571; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4572; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4573; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4574; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4575; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4576; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4577; GFX1064-NEXT: s_cbranch_execz BB22_2 4578; GFX1064-NEXT: ; %bb.1: 4579; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4580; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4581; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4582; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4583; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4584; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4585; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4586; GFX1064-NEXT: buffer_gl0_inv 4587; GFX1064-NEXT: buffer_gl1_inv 4588; GFX1064-NEXT: BB22_2: 4589; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4590; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4591; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4592; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4593; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4594; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4595; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4596; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4597; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4598; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4599; GFX1064-NEXT: s_mov_b32 s2, -1 4600; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4601; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4602; GFX1064-NEXT: s_endpgm 4603; 4604; GFX1032-LABEL: umax_i64_constant: 4605; GFX1032: ; %bb.0: ; %entry 4606; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4607; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4608; GFX1032-NEXT: ; implicit-def: $vcc_hi 4609; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4610; GFX1032-NEXT: ; implicit-def: 
$vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB22_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; umin_i32_varying - 32-bit atomic unsigned-min on the addrspace(3) global
; @local_var32 whose operand is the value returned by
; llvm.amdgcn.workitem.id.x; the value returned by the atomic is stored
; to %out.
;
; What the expectations below record:
;  * Under the GFX7LESS prefix the kernel is a single ds_min_rtn_u32 fed
;    directly from v0, with no cross-lane sequence in front of it.
;  * Under the GFX8, GFX9, GFX1064 and GFX1032 prefixes a chain of
;    v_min_u32_dpp instructions precedes the atomic, the ds_min_rtn_u32 sits
;    inside an s_and_saveexec region entered on a v_mbcnt == 0 compare, and
;    the stored value is a v_min_u32 of the v_readfirstlane'd atomic result
;    with a per-lane register.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB23_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB23_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_min_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB23_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB23_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_min_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, -1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, -1
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB23_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB23_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, -1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB23_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB23_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  ; The atomic's operand comes from the work-item id intrinsic, not a constant.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel unsigned-min on the addrspace(3) global; %old is the atomic's result.
  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the atomic's result through the addrspace(1) kernel argument.
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; umin_i64_constant - 64-bit atomic unsigned-min of the constant 5 on the
; addrspace(3) global @local_var64; the value returned by the atomic is
; stored to %out.
;
; What the expectations below record: under every check prefix the
; ds_min_rtn_u64 sits inside an s_and_saveexec region entered on a
; v_mbcnt == 0 compare, and after the region the stored value is selected
; with v_cndmask / v_cmp_lt_u64 from the v_readfirstlane'd atomic result and
; a per-lane value that is either 5 or all-ones (-1).
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: BB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  ; acq_rel unsigned-min with the immediate operand 5; %old is the atomic's result.
  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Store the atomic's result through the addrspace(1) kernel argument.
  store i64 %old, i64 addrspace(1)* %out
  ret void
}