1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the atomic optimization pass will do for atomicrmw operations on local (LDS, addrspace(3)) pointers. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 32; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 33; GFX7LESS-NEXT: s_mov_b32 m0, -1 34; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 35; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 40; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 41; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 43; GFX7LESS-NEXT: s_mov_b32 s2, -1 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: s_mul_i32 s2, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v1, 0 61; GFX8-NEXT: v_mov_b32_e32 v2, s2 62; GFX8-NEXT: s_mov_b32 m0, -1 63; GFX8-NEXT: s_waitcnt lgkmcnt(0) 64; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: 
BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: s_waitcnt lgkmcnt(0) 69; GFX8-NEXT: v_readfirstlane_b32 s2, v1 70; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 71; GFX8-NEXT: s_mov_b32 s3, 0xf000 72; GFX8-NEXT: s_mov_b32 s2, -1 73; GFX8-NEXT: s_nop 1 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: s_mul_i32 s2, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v1, 0 91; GFX9-NEXT: v_mov_b32_e32 v2, s2 92; GFX9-NEXT: s_waitcnt lgkmcnt(0) 93; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: s_waitcnt lgkmcnt(0) 98; GFX9-NEXT: v_readfirstlane_b32 s2, v1 99; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 100; GFX9-NEXT: s_mov_b32 s3, 0xf000 101; GFX9-NEXT: s_mov_b32 s2, -1 102; GFX9-NEXT: s_nop 1 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 109; GFX1064-NEXT: s_mov_b64 s[2:3], exec 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; GFX1064-NEXT: ; %bb.1: 117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: v_mov_b32_e32 v1, 0 119; GFX1064-NEXT: s_mul_i32 s2, s2, 5 120; 
GFX1064-NEXT: v_mov_b32_e32 v2, s2 121; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 122; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 123; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 124; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 125; GFX1064-NEXT: buffer_gl0_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 134; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 135; GFX1064-NEXT: s_endpgm 136; 137; GFX1032-LABEL: add_i32_constant: 138; GFX1032: ; %bb.0: ; %entry 139; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 140; GFX1032-NEXT: s_mov_b32 s3, exec_lo 141; GFX1032-NEXT: ; implicit-def: $vgpr1 142; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 143; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 144; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 145; GFX1032-NEXT: s_cbranch_execz BB0_2 146; GFX1032-NEXT: ; %bb.1: 147; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 148; GFX1032-NEXT: v_mov_b32_e32 v1, 0 149; GFX1032-NEXT: s_mul_i32 s3, s3, 5 150; GFX1032-NEXT: v_mov_b32_e32 v2, s3 151; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 152; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 153; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 154; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 155; GFX1032-NEXT: buffer_gl0_inv 156; GFX1032-NEXT: BB0_2: 157; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 158; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 159; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 160; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 161; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 162; GFX1032-NEXT: s_mov_b32 s2, -1 163; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 164; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 165; GFX1032-NEXT: s_endpgm 166entry: 167 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 168 store i32 %old, i32 addrspace(1)* 
%out 169 ret void 170} 171 172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 173; 174; 175; GFX7LESS-LABEL: add_i32_uniform: 176; GFX7LESS: ; %bb.0: ; %entry 177; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 178; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 179; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 180; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 181; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 182; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 183; GFX7LESS-NEXT: ; implicit-def: $vgpr1 184; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 185; GFX7LESS-NEXT: s_cbranch_execz BB1_2 186; GFX7LESS-NEXT: ; %bb.1: 187; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 188; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 189; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 190; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 191; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 192; GFX7LESS-NEXT: s_mov_b32 m0, -1 193; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 194; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 195; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 196; GFX7LESS-NEXT: BB1_2: 197; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 198; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 199; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 200; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 201; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 202; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 203; GFX7LESS-NEXT: s_mov_b32 s6, -1 204; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 205; GFX7LESS-NEXT: s_endpgm 206; 207; GFX8-LABEL: add_i32_uniform: 208; GFX8: ; %bb.0: ; %entry 209; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 210; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 211; GFX8-NEXT: s_mov_b64 s[2:3], exec 212; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 213; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 214; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 215; GFX8-NEXT: ; implicit-def: $vgpr1 216; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 217; GFX8-NEXT: s_cbranch_execz BB1_2 218; GFX8-NEXT: ; %bb.1: 219; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 220; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 221; GFX8-NEXT: s_mul_i32 s1, s0, s1 222; GFX8-NEXT: v_mov_b32_e32 v1, 0 223; GFX8-NEXT: v_mov_b32_e32 v2, s1 224; GFX8-NEXT: s_mov_b32 m0, -1 225; GFX8-NEXT: s_waitcnt lgkmcnt(0) 226; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 227; GFX8-NEXT: s_waitcnt lgkmcnt(0) 228; GFX8-NEXT: BB1_2: 229; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 230; GFX8-NEXT: s_waitcnt lgkmcnt(0) 231; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 232; GFX8-NEXT: v_readfirstlane_b32 s0, v1 233; GFX8-NEXT: s_mov_b32 s7, 0xf000 234; GFX8-NEXT: s_mov_b32 s6, -1 235; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 236; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 237; GFX8-NEXT: s_endpgm 238; 239; GFX9-LABEL: add_i32_uniform: 240; GFX9: ; %bb.0: ; %entry 241; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 242; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 243; GFX9-NEXT: s_mov_b64 s[6:7], exec 244; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 245; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 246; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 247; GFX9-NEXT: ; implicit-def: $vgpr1 248; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 249; GFX9-NEXT: s_cbranch_execz BB1_2 250; GFX9-NEXT: ; %bb.1: 251; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: s_mul_i32 s3, s2, s3 254; GFX9-NEXT: v_mov_b32_e32 v1, 0 255; GFX9-NEXT: v_mov_b32_e32 v2, s3 256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 257; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 259; GFX9-NEXT: BB1_2: 260; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 261; GFX9-NEXT: s_waitcnt lgkmcnt(0) 262; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 263; GFX9-NEXT: v_readfirstlane_b32 s0, v1 264; GFX9-NEXT: s_mov_b32 s7, 0xf000 265; GFX9-NEXT: s_mov_b32 s6, -1 266; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 267; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 268; GFX9-NEXT: s_endpgm 269; 270; GFX1064-LABEL: add_i32_uniform: 271; GFX1064: ; %bb.0: ; %entry 272; GFX1064-NEXT: s_clause 0x1 273; GFX1064-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x24 274; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 275; GFX1064-NEXT: s_mov_b64 s[6:7], exec 276; GFX1064-NEXT: ; implicit-def: $vgpr1 277; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 278; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 279; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 280; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 281; GFX1064-NEXT: s_cbranch_execz BB1_2 282; GFX1064-NEXT: ; %bb.1: 283; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 284; GFX1064-NEXT: v_mov_b32_e32 v1, 0 285; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 286; GFX1064-NEXT: s_mul_i32 s3, s2, s3 287; GFX1064-NEXT: v_mov_b32_e32 v2, s3 288; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 289; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 290; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: buffer_gl0_inv 293; GFX1064-NEXT: BB1_2: 294; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 295; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 296; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 297; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 298; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 299; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 300; GFX1064-NEXT: s_mov_b32 s6, -1 301; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 302; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 303; GFX1064-NEXT: s_endpgm 304; 305; GFX1032-LABEL: add_i32_uniform: 306; GFX1032: ; %bb.0: ; %entry 307; GFX1032-NEXT: s_clause 0x1 308; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 309; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 310; GFX1032-NEXT: s_mov_b32 s3, exec_lo 311; GFX1032-NEXT: ; implicit-def: $vgpr1 312; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 313; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 314; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 315; GFX1032-NEXT: s_cbranch_execz BB1_2 316; GFX1032-NEXT: ; %bb.1: 317; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 318; GFX1032-NEXT: v_mov_b32_e32 v1, 0 319; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 320; GFX1032-NEXT: s_mul_i32 s1, s2, s1 321; GFX1032-NEXT: v_mov_b32_e32 v2, s1 322; 
GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 323; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 324; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 325; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 326; GFX1032-NEXT: buffer_gl0_inv 327; GFX1032-NEXT: BB1_2: 328; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 329; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 330; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 331; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 332; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 333; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 334; GFX1032-NEXT: s_mov_b32 s6, -1 335; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 336; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 337; GFX1032-NEXT: s_endpgm 338entry: 339 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 340 store i32 %old, i32 addrspace(1)* %out 341 ret void 342} 343 344define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 345; 346; 347; GFX7LESS-LABEL: add_i32_varying: 348; GFX7LESS: ; %bb.0: ; %entry 349; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 350; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 351; GFX7LESS-NEXT: s_mov_b32 m0, -1 352; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 353; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 354; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 355; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 356; GFX7LESS-NEXT: s_mov_b32 s2, -1 357; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 358; GFX7LESS-NEXT: s_endpgm 359; 360; GFX8-LABEL: add_i32_varying: 361; GFX8: ; %bb.0: ; %entry 362; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 363; GFX8-NEXT: v_mov_b32_e32 v2, v0 364; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 365; GFX8-NEXT: v_mov_b32_e32 v1, 0 366; GFX8-NEXT: s_mov_b64 exec, s[2:3] 367; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 368; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 369; GFX8-NEXT: s_not_b64 exec, exec 370; GFX8-NEXT: v_mov_b32_e32 v2, 0 371; GFX8-NEXT: s_not_b64 exec, exec 372; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 373; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 374; GFX8-NEXT: s_nop 1 375; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 376; GFX8-NEXT: s_nop 1 377; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 378; GFX8-NEXT: s_nop 1 379; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 380; GFX8-NEXT: s_nop 1 381; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 382; GFX8-NEXT: s_nop 1 383; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 384; GFX8-NEXT: v_readlane_b32 s4, v2, 63 385; GFX8-NEXT: s_nop 0 386; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 387; GFX8-NEXT: s_mov_b64 exec, s[2:3] 388; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 389; GFX8-NEXT: ; implicit-def: $vgpr0 390; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 391; GFX8-NEXT: s_cbranch_execz BB2_2 392; GFX8-NEXT: ; %bb.1: 393; GFX8-NEXT: v_mov_b32_e32 v0, 0 394; GFX8-NEXT: v_mov_b32_e32 v3, s4 395; GFX8-NEXT: s_mov_b32 m0, -1 396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 397; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: BB2_2: 400; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 402; GFX8-NEXT: v_readfirstlane_b32 s2, v0 403; GFX8-NEXT: v_mov_b32_e32 v0, v1 404; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 405; GFX8-NEXT: s_mov_b32 s3, 0xf000 406; GFX8-NEXT: s_mov_b32 s2, -1 407; GFX8-NEXT: s_nop 0 408; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 409; GFX8-NEXT: s_endpgm 410; 411; GFX9-LABEL: add_i32_varying: 412; GFX9: ; %bb.0: ; %entry 413; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 414; GFX9-NEXT: v_mov_b32_e32 v2, v0 415; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 416; GFX9-NEXT: v_mov_b32_e32 v1, 0 417; GFX9-NEXT: s_mov_b64 exec, s[2:3] 418; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 419; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 
420; GFX9-NEXT: s_not_b64 exec, exec 421; GFX9-NEXT: v_mov_b32_e32 v2, 0 422; GFX9-NEXT: s_not_b64 exec, exec 423; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 424; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 425; GFX9-NEXT: s_nop 1 426; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 427; GFX9-NEXT: s_nop 1 428; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 429; GFX9-NEXT: s_nop 1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 435; GFX9-NEXT: v_readlane_b32 s4, v2, 63 436; GFX9-NEXT: s_nop 0 437; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 438; GFX9-NEXT: s_mov_b64 exec, s[2:3] 439; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 440; GFX9-NEXT: ; implicit-def: $vgpr0 441; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 442; GFX9-NEXT: s_cbranch_execz BB2_2 443; GFX9-NEXT: ; %bb.1: 444; GFX9-NEXT: v_mov_b32_e32 v0, 0 445; GFX9-NEXT: v_mov_b32_e32 v3, s4 446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 447; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 448; GFX9-NEXT: s_waitcnt lgkmcnt(0) 449; GFX9-NEXT: BB2_2: 450; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 451; GFX9-NEXT: s_waitcnt lgkmcnt(0) 452; GFX9-NEXT: v_readfirstlane_b32 s2, v0 453; GFX9-NEXT: v_mov_b32_e32 v0, v1 454; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 455; GFX9-NEXT: s_mov_b32 s3, 0xf000 456; GFX9-NEXT: s_mov_b32 s2, -1 457; GFX9-NEXT: s_nop 0 458; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 459; GFX9-NEXT: s_endpgm 460; 461; GFX1064-LABEL: add_i32_varying: 462; GFX1064: ; %bb.0: ; %entry 463; GFX1064-NEXT: v_mov_b32_e32 v1, v0 464; GFX1064-NEXT: s_not_b64 exec, exec 465; GFX1064-NEXT: v_mov_b32_e32 v1, 0 466; GFX1064-NEXT: s_not_b64 exec, exec 467; 
GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 468; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 469; GFX1064-NEXT: v_mov_b32_e32 v3, 0 470; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 471; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 472; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 473; GFX1064-NEXT: v_mov_b32_e32 v2, v1 474; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 475; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 476; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 477; GFX1064-NEXT: v_mov_b32_e32 v2, s4 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 479; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 480; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 481; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 482; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 483; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 484; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 485; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 486; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 487; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 488; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 489; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 490; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 491; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 492; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 493; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 494; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 495; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 496; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 497; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 498; GFX1064-NEXT: s_mov_b32 s2, -1 499; GFX1064-NEXT: ; implicit-def: $vgpr0 500; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 501; GFX1064-NEXT: s_cbranch_execz BB2_2 502; GFX1064-NEXT: ; %bb.1: 503; GFX1064-NEXT: v_mov_b32_e32 v0, 0 504; 
GFX1064-NEXT: v_mov_b32_e32 v4, s7 505; GFX1064-NEXT: s_mov_b32 s3, s7 506; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 507; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 508; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 509; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 510; GFX1064-NEXT: buffer_gl0_inv 511; GFX1064-NEXT: BB2_2: 512; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 513; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 514; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 515; GFX1064-NEXT: v_mov_b32_e32 v0, v3 516; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 517; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 518; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 519; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 520; GFX1064-NEXT: s_endpgm 521; 522; GFX1032-LABEL: add_i32_varying: 523; GFX1032: ; %bb.0: ; %entry 524; GFX1032-NEXT: v_mov_b32_e32 v1, v0 525; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 526; GFX1032-NEXT: v_mov_b32_e32 v1, 0 527; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 528; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 529; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 530; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 531; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 532; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 533; GFX1032-NEXT: v_mov_b32_e32 v2, v1 534; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 535; GFX1032-NEXT: s_mov_b32 exec_lo, s2 536; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 537; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 538; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 539; GFX1032-NEXT: v_mov_b32_e32 v3, 0 540; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 541; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 542; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 543; GFX1032-NEXT: s_mov_b32 exec_lo, s2 544; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 545; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 546; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 547; GFX1032-NEXT: s_mov_b32 exec_lo, s2 548; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 549; GFX1032-NEXT: s_mov_b32 s2, -1 550; GFX1032-NEXT: ; implicit-def: $vgpr0 551; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 552; GFX1032-NEXT: s_cbranch_execz BB2_2 553; GFX1032-NEXT: ; %bb.1: 554; GFX1032-NEXT: v_mov_b32_e32 v0, 0 555; GFX1032-NEXT: v_mov_b32_e32 v4, s4 556; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 557; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 558; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 559; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 560; GFX1032-NEXT: buffer_gl0_inv 561; GFX1032-NEXT: BB2_2: 562; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 563; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 564; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 565; GFX1032-NEXT: v_mov_b32_e32 v0, v3 566; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 567; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 568; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 569; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 570; GFX1032-NEXT: s_endpgm 571entry: 572 %lane = call i32 @llvm.amdgcn.workitem.id.x() 573 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 574 store i32 %old, i32 addrspace(1)* %out 575 ret void 576} 577 578define amdgpu_kernel void @add_i32_varying_nouse() { 579; GFX7LESS-LABEL: add_i32_varying_nouse: 580; GFX7LESS: ; %bb.0: ; %entry 581; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 582; GFX7LESS-NEXT: s_mov_b32 m0, -1 583; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 584; GFX7LESS-NEXT: ds_add_u32 v1, v0 585; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 586; GFX7LESS-NEXT: s_endpgm 587; 588; GFX8-LABEL: add_i32_varying_nouse: 589; GFX8: ; %bb.0: ; %entry 590; GFX8-NEXT: v_mov_b32_e32 v1, v0 591; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 592; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 593; GFX8-NEXT: s_not_b64 exec, exec 594; GFX8-NEXT: v_mov_b32_e32 v1, 0 595; GFX8-NEXT: s_not_b64 exec, exec 596; GFX8-NEXT: 
s_or_saveexec_b64 s[0:1], -1 597; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 598; GFX8-NEXT: s_nop 1 599; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 600; GFX8-NEXT: s_nop 1 601; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 602; GFX8-NEXT: s_nop 1 603; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 604; GFX8-NEXT: s_nop 1 605; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 606; GFX8-NEXT: s_nop 1 607; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 608; GFX8-NEXT: v_readlane_b32 s2, v1, 63 609; GFX8-NEXT: s_mov_b64 exec, s[0:1] 610; GFX8-NEXT: s_mov_b32 s0, s2 611; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 612; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 613; GFX8-NEXT: s_cbranch_execz BB3_2 614; GFX8-NEXT: ; %bb.1: 615; GFX8-NEXT: v_mov_b32_e32 v0, 0 616; GFX8-NEXT: v_mov_b32_e32 v2, s0 617; GFX8-NEXT: s_mov_b32 m0, -1 618; GFX8-NEXT: s_waitcnt lgkmcnt(0) 619; GFX8-NEXT: ds_add_u32 v0, v2 620; GFX8-NEXT: s_waitcnt lgkmcnt(0) 621; GFX8-NEXT: BB3_2: 622; GFX8-NEXT: s_endpgm 623; 624; GFX9-LABEL: add_i32_varying_nouse: 625; GFX9: ; %bb.0: ; %entry 626; GFX9-NEXT: v_mov_b32_e32 v1, v0 627; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 628; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 629; GFX9-NEXT: s_not_b64 exec, exec 630; GFX9-NEXT: v_mov_b32_e32 v1, 0 631; GFX9-NEXT: s_not_b64 exec, exec 632; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 633; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 634; GFX9-NEXT: s_nop 1 635; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 636; GFX9-NEXT: s_nop 1 637; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 638; GFX9-NEXT: s_nop 1 639; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 640; GFX9-NEXT: s_nop 1 641; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 642; GFX9-NEXT: s_nop 1 643; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 644; GFX9-NEXT: v_readlane_b32 s2, v1, 63 645; GFX9-NEXT: s_mov_b64 exec, s[0:1] 646; GFX9-NEXT: s_mov_b32 s0, s2 647; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 648; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 649; GFX9-NEXT: s_cbranch_execz BB3_2 650; GFX9-NEXT: ; %bb.1: 651; GFX9-NEXT: v_mov_b32_e32 v0, 0 652; GFX9-NEXT: v_mov_b32_e32 v2, s0 653; GFX9-NEXT: s_waitcnt lgkmcnt(0) 654; GFX9-NEXT: ds_add_u32 v0, v2 655; GFX9-NEXT: s_waitcnt lgkmcnt(0) 656; GFX9-NEXT: BB3_2: 657; GFX9-NEXT: s_endpgm 658; 659; GFX1064-LABEL: add_i32_varying_nouse: 660; GFX1064: ; %bb.0: ; %entry 661; GFX1064-NEXT: v_mov_b32_e32 v1, v0 662; GFX1064-NEXT: s_not_b64 exec, exec 663; GFX1064-NEXT: v_mov_b32_e32 v1, 0 664; GFX1064-NEXT: s_not_b64 exec, exec 665; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 666; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 667; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 668; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 670; GFX1064-NEXT: v_mov_b32_e32 v2, v1 671; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 672; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 673; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 674; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 675; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 676; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 677; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 678; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 679; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 680; GFX1064-NEXT: s_add_i32 s0, s2, s3 681; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 682; 
GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 683; GFX1064-NEXT: s_cbranch_execz BB3_2 684; GFX1064-NEXT: ; %bb.1: 685; GFX1064-NEXT: v_mov_b32_e32 v0, 0 686; GFX1064-NEXT: v_mov_b32_e32 v3, s0 687; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 688; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 689; GFX1064-NEXT: ds_add_u32 v0, v3 690; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 691; GFX1064-NEXT: buffer_gl0_inv 692; GFX1064-NEXT: BB3_2: 693; GFX1064-NEXT: s_endpgm 694; 695; GFX1032-LABEL: add_i32_varying_nouse: 696; GFX1032: ; %bb.0: ; %entry 697; GFX1032-NEXT: v_mov_b32_e32 v1, v0 698; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 699; GFX1032-NEXT: v_mov_b32_e32 v1, 0 700; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 701; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 702; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 703; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 704; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 705; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 706; GFX1032-NEXT: v_mov_b32_e32 v2, v1 707; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 708; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 709; GFX1032-NEXT: s_mov_b32 exec_lo, s0 710; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 711; GFX1032-NEXT: v_mov_b32_e32 v0, v1 712; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 713; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 714; GFX1032-NEXT: s_cbranch_execz BB3_2 715; GFX1032-NEXT: ; %bb.1: 716; GFX1032-NEXT: v_mov_b32_e32 v3, 0 717; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 718; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 719; GFX1032-NEXT: ds_add_u32 v3, v0 720; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 721; GFX1032-NEXT: buffer_gl0_inv 722; GFX1032-NEXT: BB3_2: 723; GFX1032-NEXT: s_endpgm 724entry: 725 %lane = call i32 @llvm.amdgcn.workitem.id.x() 726 %old = atomicrmw add i32 addrspace(3)* @local_var32, 
i32 %lane acq_rel 727 ret void 728} 729 730define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 731; Adds the constant 5 to @local_var64 (addrspace(3)) with an acq_rel atomicrmw and stores the returned old value to %out. The checks below expect a single ds_add_rtn_u64 guarded by v_mbcnt + s_and_saveexec, with the addend formed from s_bcnt1 of the saved exec mask times 5. 732; 733; GFX7LESS-LABEL: add_i64_constant: 734; GFX7LESS: ; %bb.0: ; %entry 735; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 736; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 737; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 738; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 739; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 740; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 741; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 742; GFX7LESS-NEXT: s_cbranch_execz BB4_2 743; GFX7LESS-NEXT: ; %bb.1: 744; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 745; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 746; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 747; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 748; GFX7LESS-NEXT: s_mov_b32 m0, -1 749; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 750; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] 751; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 752; GFX7LESS-NEXT: BB4_2: 753; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 754; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 755; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 756; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 757; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 758; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 759; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 760; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 761; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 762; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 763; GFX7LESS-NEXT: s_mov_b32 s2, -1 764; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 765; GFX7LESS-NEXT: s_endpgm 766; 767; GFX8-LABEL: add_i64_constant: 768; GFX8: ; %bb.0: ; %entry 769; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 770; GFX8-NEXT: s_mov_b64 s[4:5], exec 771; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 772; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 773; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 774; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 775; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 776; GFX8-NEXT:
s_cbranch_execz BB4_2 777; GFX8-NEXT: ; %bb.1: 778; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 779; GFX8-NEXT: s_mul_i32 s4, s4, 5 780; GFX8-NEXT: v_mov_b32_e32 v1, s4 781; GFX8-NEXT: v_mov_b32_e32 v2, 0 782; GFX8-NEXT: s_mov_b32 m0, -1 783; GFX8-NEXT: s_waitcnt lgkmcnt(0) 784; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] 785; GFX8-NEXT: s_waitcnt lgkmcnt(0) 786; GFX8-NEXT: BB4_2: 787; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 788; GFX8-NEXT: s_waitcnt lgkmcnt(0) 789; GFX8-NEXT: v_readfirstlane_b32 s2, v1 790; GFX8-NEXT: v_readfirstlane_b32 s3, v2 791; GFX8-NEXT: v_mov_b32_e32 v1, s2 792; GFX8-NEXT: v_mov_b32_e32 v2, s3 793; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 794; GFX8-NEXT: s_mov_b32 s3, 0xf000 795; GFX8-NEXT: s_mov_b32 s2, -1 796; GFX8-NEXT: s_nop 2 797; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 798; GFX8-NEXT: s_endpgm 799; 800; GFX9-LABEL: add_i64_constant: 801; GFX9: ; %bb.0: ; %entry 802; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 803; GFX9-NEXT: s_mov_b64 s[4:5], exec 804; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 805; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 806; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 807; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 808; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 809; GFX9-NEXT: s_cbranch_execz BB4_2 810; GFX9-NEXT: ; %bb.1: 811; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 812; GFX9-NEXT: s_mul_i32 s4, s4, 5 813; GFX9-NEXT: v_mov_b32_e32 v1, s4 814; GFX9-NEXT: v_mov_b32_e32 v2, 0 815; GFX9-NEXT: s_waitcnt lgkmcnt(0) 816; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] 817; GFX9-NEXT: s_waitcnt lgkmcnt(0) 818; GFX9-NEXT: BB4_2: 819; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 820; GFX9-NEXT: s_waitcnt lgkmcnt(0) 821; GFX9-NEXT: v_readfirstlane_b32 s2, v1 822; GFX9-NEXT: v_readfirstlane_b32 s3, v2 823; GFX9-NEXT: v_mov_b32_e32 v1, s2 824; GFX9-NEXT: v_mov_b32_e32 v2, s3 825; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 826; GFX9-NEXT: s_mov_b32 s3, 0xf000 827; GFX9-NEXT: s_mov_b32 s2, -1 828; GFX9-NEXT:
s_nop 2 829; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 830; GFX9-NEXT: s_endpgm 831; 832; GFX1064-LABEL: add_i64_constant: 833; GFX1064: ; %bb.0: ; %entry 834; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 835; GFX1064-NEXT: s_mov_b64 s[4:5], exec 836; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 837; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 838; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 839; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 840; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 841; GFX1064-NEXT: s_cbranch_execz BB4_2 842; GFX1064-NEXT: ; %bb.1: 843; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 844; GFX1064-NEXT: v_mov_b32_e32 v2, 0 845; GFX1064-NEXT: s_mul_i32 s4, s4, 5 846; GFX1064-NEXT: v_mov_b32_e32 v1, s4 847; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 848; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 849; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] 850; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 851; GFX1064-NEXT: buffer_gl0_inv 852; GFX1064-NEXT: BB4_2: 853; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 854; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 855; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 856; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 857; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 858; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 859; GFX1064-NEXT: s_mov_b32 s2, -1 860; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 861; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 862; GFX1064-NEXT: s_endpgm 863; 864; GFX1032-LABEL: add_i64_constant: 865; GFX1032: ; %bb.0: ; %entry 866; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 867; GFX1032-NEXT: s_mov_b32 s3, exec_lo 868; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 869; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 870; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 871; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 872; GFX1032-NEXT: s_cbranch_execz BB4_2 873; GFX1032-NEXT: ; %bb.1: 874; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 875; GFX1032-NEXT: v_mov_b32_e32 v2, 0 876; GFX1032-NEXT: s_mul_i32 s3, s3, 5
877; GFX1032-NEXT: v_mov_b32_e32 v1, s3 878; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 879; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 880; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] 881; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 882; GFX1032-NEXT: buffer_gl0_inv 883; GFX1032-NEXT: BB4_2: 884; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 885; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 886; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 887; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 888; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 889; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 890; GFX1032-NEXT: s_mov_b32 s2, -1 891; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 892; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 893; GFX1032-NEXT: s_endpgm 894entry: 895 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 896 store i64 %old, i64 addrspace(1)* %out 897 ret void 898} 899 900define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 901; Adds the i64 kernel argument %additive to @local_var64 (addrspace(3)) with an acq_rel atomicrmw and stores the returned old value to %out. The checks below expect a single ds_add_rtn_u64 guarded by v_mbcnt + s_and_saveexec, with the addend formed by multiplying the loaded argument by the s_bcnt1 count of the saved exec mask. 902; 903; GFX7LESS-LABEL: add_i64_uniform: 904; GFX7LESS: ; %bb.0: ; %entry 905; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 906; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 907; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 908; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 909; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 910; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 911; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 912; GFX7LESS-NEXT: s_cbranch_execz BB5_2 913; GFX7LESS-NEXT: ; %bb.1: 914; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 915; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 916; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 917; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 918; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 919; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 920; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 921; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 922; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 923; GFX7LESS-NEXT: s_mov_b32 m0, -1 924; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 925; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
926; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 927; GFX7LESS-NEXT: BB5_2: 928; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 929; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 930; GFX7LESS-NEXT: s_mov_b32 s6, -1 931; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 932; GFX7LESS-NEXT: s_mov_b32 s4, s0 933; GFX7LESS-NEXT: s_mov_b32 s5, s1 934; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 935; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 936; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 937; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 938; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 939; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 940; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 941; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 942; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 943; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 944; GFX7LESS-NEXT: s_endpgm 945; 946; GFX8-LABEL: add_i64_uniform: 947; GFX8: ; %bb.0: ; %entry 948; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 949; GFX8-NEXT: s_mov_b64 s[6:7], exec 950; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 951; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 952; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 953; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 954; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 955; GFX8-NEXT: s_cbranch_execz BB5_2 956; GFX8-NEXT: ; %bb.1: 957; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 958; GFX8-NEXT: v_mov_b32_e32 v1, s6 959; GFX8-NEXT: s_waitcnt lgkmcnt(0) 960; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 961; GFX8-NEXT: s_mul_i32 s7, s3, s6 962; GFX8-NEXT: s_mul_i32 s6, s2, s6 963; GFX8-NEXT: v_mov_b32_e32 v3, 0 964; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 965; GFX8-NEXT: v_mov_b32_e32 v1, s6 966; GFX8-NEXT: s_mov_b32 m0, -1 967; GFX8-NEXT: s_waitcnt lgkmcnt(0) 968; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 969; GFX8-NEXT: s_waitcnt lgkmcnt(0) 970; GFX8-NEXT: BB5_2: 971; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 972; GFX8-NEXT: s_waitcnt lgkmcnt(0) 973; GFX8-NEXT: s_mov_b32 s4, s0 974; GFX8-NEXT: v_readfirstlane_b32 s0, v1 975; GFX8-NEXT: v_mul_lo_u32 v1, s3,
v0 976; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 977; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 978; GFX8-NEXT: s_mov_b32 s5, s1 979; GFX8-NEXT: v_readfirstlane_b32 s1, v2 980; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 981; GFX8-NEXT: v_mov_b32_e32 v2, s1 982; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 983; GFX8-NEXT: s_mov_b32 s7, 0xf000 984; GFX8-NEXT: s_mov_b32 s6, -1 985; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 986; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 987; GFX8-NEXT: s_endpgm 988; 989; GFX9-LABEL: add_i64_uniform: 990; GFX9: ; %bb.0: ; %entry 991; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 992; GFX9-NEXT: s_mov_b64 s[6:7], exec 993; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 994; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 995; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 996; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 997; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 998; GFX9-NEXT: s_cbranch_execz BB5_2 999; GFX9-NEXT: ; %bb.1: 1000; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1001; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1002; GFX9-NEXT: s_mul_i32 s7, s3, s6 1003; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1004; GFX9-NEXT: s_add_i32 s8, s8, s7 1005; GFX9-NEXT: s_mul_i32 s6, s2, s6 1006; GFX9-NEXT: v_mov_b32_e32 v1, s6 1007; GFX9-NEXT: v_mov_b32_e32 v2, s8 1008; GFX9-NEXT: v_mov_b32_e32 v3, 0 1009; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1010; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1011; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1012; GFX9-NEXT: BB5_2: 1013; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1014; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1015; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1016; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1017; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1018; GFX9-NEXT: s_mov_b32 s4, s0 1019; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1020; GFX9-NEXT: s_mov_b32 s5, s1 1021; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1022; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1023; GFX9-NEXT: v_mov_b32_e32 v2, s1 1024; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1025; GFX9-NEXT: s_mov_b32 s7, 0xf000 1026; GFX9-NEXT:
s_mov_b32 s6, -1 1027; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1028; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1029; GFX9-NEXT: s_endpgm 1030; 1031; GFX1064-LABEL: add_i64_uniform: 1032; GFX1064: ; %bb.0: ; %entry 1033; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1034; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1035; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1036; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1037; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1038; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1039; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1040; GFX1064-NEXT: s_cbranch_execz BB5_2 1041; GFX1064-NEXT: ; %bb.1: 1042; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1043; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1044; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1045; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1046; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1047; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1048; GFX1064-NEXT: s_add_i32 s8, s8, s7 1049; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1050; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1051; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1052; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1053; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1054; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1055; GFX1064-NEXT: buffer_gl0_inv 1056; GFX1064-NEXT: BB5_2: 1057; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1058; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1059; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1060; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1061; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1062; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1063; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1064; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1065; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1066; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1067; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v0 1068; GFX1064-NEXT: s_mov_b32 s2, -1 1069; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1070; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1071; GFX1064-NEXT: s_endpgm 1072; 1073;
GFX1032-LABEL: add_i64_uniform: 1074; GFX1032: ; %bb.0: ; %entry 1075; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1076; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1077; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1078; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1079; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1080; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1081; GFX1032-NEXT: s_cbranch_execz BB5_2 1082; GFX1032-NEXT: ; %bb.1: 1083; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1084; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1085; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1087; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1088; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1089; GFX1032-NEXT: s_add_i32 s7, s7, s6 1090; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1091; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1092; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1093; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1094; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1095; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1096; GFX1032-NEXT: buffer_gl0_inv 1097; GFX1032-NEXT: BB5_2: 1098; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1099; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1100; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1101; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1102; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1103; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1104; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1105; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 1106; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1107; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1108; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 1109; GFX1032-NEXT: s_mov_b32 s2, -1 1110; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1111; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1112; GFX1032-NEXT: s_endpgm 1113entry: 1114 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1115 store i64 %old, i64 addrspace(1)* %out 1116 ret void 1117} 1118 1119define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1120; Adds the zero-extended workitem id (llvm.amdgcn.workitem.id.x) to @local_var64 (addrspace(3)) with an acq_rel atomicrmw and stores the returned old value to %out. For every prefix checked below the expected output is a plain ds_add_rtn_u64 on v[0:1] with no v_mbcnt / s_and_saveexec guard. 1121; 1122; GFX7LESS-LABEL: add_i64_varying: 1123; GFX7LESS: ; %bb.0: ; %entry 1124; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1125; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1126; GFX7LESS-NEXT: s_mov_b32 m0, -1 1127; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1128; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1129; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1130; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1131; GFX7LESS-NEXT: s_mov_b32 s2, -1 1132; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1133; GFX7LESS-NEXT: s_endpgm 1134; 1135; GFX8-LABEL: add_i64_varying: 1136; GFX8: ; %bb.0: ; %entry 1137; GFX8-NEXT: v_mov_b32_e32 v1, 0 1138; GFX8-NEXT: s_mov_b32 m0, -1 1139; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1140; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1141; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1142; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1143; GFX8-NEXT: s_mov_b32 s3, 0xf000 1144; GFX8-NEXT: s_mov_b32 s2, -1 1145; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1146; GFX8-NEXT: s_endpgm 1147; 1148; GFX9-LABEL: add_i64_varying: 1149; GFX9: ; %bb.0: ; %entry 1150; GFX9-NEXT: v_mov_b32_e32 v1, 0 1151; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1152; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1153; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1154; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1155; GFX9-NEXT: s_mov_b32 s3, 0xf000 1156; GFX9-NEXT: s_mov_b32 s2, -1 1157; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1158; GFX9-NEXT: s_endpgm 1159; 1160; GFX10-LABEL: add_i64_varying: 1161; GFX10: ; %bb.0: ; %entry 1162; GFX10-NEXT: v_mov_b32_e32 v1, 0 1163; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1164; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1165; GFX10-NEXT: s_mov_b32 s2, -1 1166; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1167; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1168; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1169; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1170; GFX10-NEXT: buffer_gl0_inv 1171; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1172;
GFX10-NEXT: s_endpgm 1173entry: 1174 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1175 %zext = zext i32 %lane to i64 1176 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1177 store i64 %old, i64 addrspace(1)* %out 1178 ret void 1179} 1180 1181define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1182; Subtracts the constant 5 from @local_var32 (addrspace(3)) with an acq_rel atomicrmw and stores the returned old value to %out. The checks below expect a single ds_sub_rtn_u32 guarded by v_mbcnt + s_and_saveexec, with the operand formed from s_bcnt1 of the saved exec mask times 5; after the guard each lane subtracts 5 times its v_mbcnt result from the v_readfirstlane value. 1183; 1184; GFX7LESS-LABEL: sub_i32_constant: 1185; GFX7LESS: ; %bb.0: ; %entry 1186; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1187; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1188; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1189; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1190; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1191; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1192; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1193; GFX7LESS-NEXT: s_cbranch_execz BB7_2 1194; GFX7LESS-NEXT: ; %bb.1: 1195; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1196; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1197; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1198; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1199; GFX7LESS-NEXT: s_mov_b32 m0, -1 1200; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1201; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1202; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX7LESS-NEXT: BB7_2: 1204; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1205; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1206; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1207; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1208; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1209; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1210; GFX7LESS-NEXT: s_mov_b32 s2, -1 1211; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1212; GFX7LESS-NEXT: s_endpgm 1213; 1214; GFX8-LABEL: sub_i32_constant: 1215; GFX8: ; %bb.0: ; %entry 1216; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1217; GFX8-NEXT: s_mov_b64 s[2:3], exec 1218; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1219; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1220; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1221; GFX8-NEXT: ; implicit-def: $vgpr1 1222; GFX8-NEXT:
s_and_saveexec_b64 s[4:5], vcc 1223; GFX8-NEXT: s_cbranch_execz BB7_2 1224; GFX8-NEXT: ; %bb.1: 1225; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1226; GFX8-NEXT: s_mul_i32 s2, s2, 5 1227; GFX8-NEXT: v_mov_b32_e32 v1, 0 1228; GFX8-NEXT: v_mov_b32_e32 v2, s2 1229; GFX8-NEXT: s_mov_b32 m0, -1 1230; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1231; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1232; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1233; GFX8-NEXT: BB7_2: 1234; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1237; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1238; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1239; GFX8-NEXT: s_mov_b32 s3, 0xf000 1240; GFX8-NEXT: s_mov_b32 s2, -1 1241; GFX8-NEXT: s_nop 0 1242; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1243; GFX8-NEXT: s_endpgm 1244; 1245; GFX9-LABEL: sub_i32_constant: 1246; GFX9: ; %bb.0: ; %entry 1247; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1248; GFX9-NEXT: s_mov_b64 s[2:3], exec 1249; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1250; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1251; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1252; GFX9-NEXT: ; implicit-def: $vgpr1 1253; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1254; GFX9-NEXT: s_cbranch_execz BB7_2 1255; GFX9-NEXT: ; %bb.1: 1256; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1257; GFX9-NEXT: s_mul_i32 s2, s2, 5 1258; GFX9-NEXT: v_mov_b32_e32 v1, 0 1259; GFX9-NEXT: v_mov_b32_e32 v2, s2 1260; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1262; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1263; GFX9-NEXT: BB7_2: 1264; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1265; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1266; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1267; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1268; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1269; GFX9-NEXT: s_mov_b32 s3, 0xf000 1270; GFX9-NEXT: s_mov_b32 s2, -1 1271; GFX9-NEXT: s_nop 0 1272; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1273; GFX9-NEXT: s_endpgm 1274; 1275; GFX1064-LABEL:
sub_i32_constant: 1276; GFX1064: ; %bb.0: ; %entry 1277; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1278; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1279; GFX1064-NEXT: ; implicit-def: $vgpr1 1280; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1281; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1282; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1283; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1284; GFX1064-NEXT: s_cbranch_execz BB7_2 1285; GFX1064-NEXT: ; %bb.1: 1286; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1287; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1288; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1289; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1290; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1291; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1292; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1293; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX1064-NEXT: buffer_gl0_inv 1295; GFX1064-NEXT: BB7_2: 1296; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1297; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1298; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1299; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1300; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1301; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1302; GFX1064-NEXT: s_mov_b32 s2, -1 1303; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1305; GFX1064-NEXT: s_endpgm 1306; 1307; GFX1032-LABEL: sub_i32_constant: 1308; GFX1032: ; %bb.0: ; %entry 1309; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1310; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1311; GFX1032-NEXT: ; implicit-def: $vgpr1 1312; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1313; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1314; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1315; GFX1032-NEXT: s_cbranch_execz BB7_2 1316; GFX1032-NEXT: ; %bb.1: 1317; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1318; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1319; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1320; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1321; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1322; GFX1032-NEXT:
s_waitcnt_vscnt null, 0x0 1323; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1324; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1325; GFX1032-NEXT: buffer_gl0_inv 1326; GFX1032-NEXT: BB7_2: 1327; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1328; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1329; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1330; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1331; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1332; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1333; GFX1032-NEXT: s_mov_b32 s2, -1 1334; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1335; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1336; GFX1032-NEXT: s_endpgm 1337entry: 1338 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1339 store i32 %old, i32 addrspace(1)* %out 1340 ret void 1341} 1342 1343define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1344; Subtracts the i32 kernel argument %subitive from @local_var32 (addrspace(3)) with an acq_rel atomicrmw and stores the returned old value to %out. The checks below expect a single ds_sub_rtn_u32 guarded by v_mbcnt + s_and_saveexec, with the operand formed by multiplying the loaded argument by the s_bcnt1 count of the saved exec mask; after the guard each lane subtracts the argument times its v_mbcnt result from the v_readfirstlane value. 1345; 1346; GFX7LESS-LABEL: sub_i32_uniform: 1347; GFX7LESS: ; %bb.0: ; %entry 1348; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1349; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1350; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 1351; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1352; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1353; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1354; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1355; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 1356; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1357; GFX7LESS-NEXT: ; %bb.1: 1358; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1359; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1360; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 1361; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1362; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1363; GFX7LESS-NEXT: s_mov_b32 m0, -1 1364; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1365; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1366; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1367; GFX7LESS-NEXT: BB8_2: 1368; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 1369; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1370; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1371; GFX7LESS-NEXT:
v_mul_lo_u32 v0, s0, v0 1372; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1373; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 1374; GFX7LESS-NEXT: s_mov_b32 s6, -1 1375; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1376; GFX7LESS-NEXT: s_endpgm 1377; 1378; GFX8-LABEL: sub_i32_uniform: 1379; GFX8: ; %bb.0: ; %entry 1380; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1381; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1382; GFX8-NEXT: s_mov_b64 s[2:3], exec 1383; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1384; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1385; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1386; GFX8-NEXT: ; implicit-def: $vgpr1 1387; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1388; GFX8-NEXT: s_cbranch_execz BB8_2 1389; GFX8-NEXT: ; %bb.1: 1390; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1391; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1392; GFX8-NEXT: s_mul_i32 s1, s0, s1 1393; GFX8-NEXT: v_mov_b32_e32 v1, 0 1394; GFX8-NEXT: v_mov_b32_e32 v2, s1 1395; GFX8-NEXT: s_mov_b32 m0, -1 1396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1399; GFX8-NEXT: BB8_2: 1400; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1402; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1403; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1404; GFX8-NEXT: s_mov_b32 s7, 0xf000 1405; GFX8-NEXT: s_mov_b32 s6, -1 1406; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1407; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1408; GFX8-NEXT: s_endpgm 1409; 1410; GFX9-LABEL: sub_i32_uniform: 1411; GFX9: ; %bb.0: ; %entry 1412; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1413; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1414; GFX9-NEXT: s_mov_b64 s[6:7], exec 1415; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1416; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1417; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1418; GFX9-NEXT: ; implicit-def: $vgpr1 1419; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1420; GFX9-NEXT: s_cbranch_execz BB8_2 1421; GFX9-NEXT: ; %bb.1: 1422;
GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1423; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1424; GFX9-NEXT: s_mul_i32 s3, s2, s3 1425; GFX9-NEXT: v_mov_b32_e32 v1, 0 1426; GFX9-NEXT: v_mov_b32_e32 v2, s3 1427; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1428; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1429; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1430; GFX9-NEXT: BB8_2: 1431; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1432; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1433; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1434; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1435; GFX9-NEXT: s_mov_b32 s7, 0xf000 1436; GFX9-NEXT: s_mov_b32 s6, -1 1437; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1438; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1439; GFX9-NEXT: s_endpgm 1440; 1441; GFX1064-LABEL: sub_i32_uniform: 1442; GFX1064: ; %bb.0: ; %entry 1443; GFX1064-NEXT: s_clause 0x1 1444; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1445; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 1446; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1447; GFX1064-NEXT: ; implicit-def: $vgpr1 1448; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1449; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1450; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1451; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1452; GFX1064-NEXT: s_cbranch_execz BB8_2 1453; GFX1064-NEXT: ; %bb.1: 1454; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1455; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1456; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1457; GFX1064-NEXT: s_mul_i32 s3, s2, s3 1458; GFX1064-NEXT: v_mov_b32_e32 v2, s3 1459; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1460; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1461; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1462; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1463; GFX1064-NEXT: buffer_gl0_inv 1464; GFX1064-NEXT: BB8_2: 1465; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1466; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1467; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1468; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1469; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1470; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1471;
GFX1064-NEXT: s_mov_b32 s6, -1 1472; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1473; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1474; GFX1064-NEXT: s_endpgm 1475; 1476; GFX1032-LABEL: sub_i32_uniform: 1477; GFX1032: ; %bb.0: ; %entry 1478; GFX1032-NEXT: s_clause 0x1 1479; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1480; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1481; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1482; GFX1032-NEXT: ; implicit-def: $vgpr1 1483; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1484; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1485; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1486; GFX1032-NEXT: s_cbranch_execz BB8_2 1487; GFX1032-NEXT: ; %bb.1: 1488; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1489; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1490; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1492; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1493; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1494; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1495; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1496; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1497; GFX1032-NEXT: buffer_gl0_inv 1498; GFX1032-NEXT: BB8_2: 1499; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1500; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1501; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1502; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1503; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1504; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1505; GFX1032-NEXT: s_mov_b32 s6, -1 1506; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1507; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1508; GFX1032-NEXT: s_endpgm 1509entry: 1510 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1511 store i32 %old, i32 addrspace(1)* %out 1512 ret void 1513} 1514 1515define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1516; 1517; 1518; GFX7LESS-LABEL: sub_i32_varying: 1519; GFX7LESS: ; %bb.0: ; %entry 1520; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1521; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
1522; GFX7LESS-NEXT: s_mov_b32 m0, -1 1523; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1524; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1525; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1526; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1527; GFX7LESS-NEXT: s_mov_b32 s2, -1 1528; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1529; GFX7LESS-NEXT: s_endpgm 1530; 1531; GFX8-LABEL: sub_i32_varying: 1532; GFX8: ; %bb.0: ; %entry 1533; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1534; GFX8-NEXT: v_mov_b32_e32 v2, v0 1535; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1536; GFX8-NEXT: v_mov_b32_e32 v1, 0 1537; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1538; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1539; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1540; GFX8-NEXT: s_not_b64 exec, exec 1541; GFX8-NEXT: v_mov_b32_e32 v2, 0 1542; GFX8-NEXT: s_not_b64 exec, exec 1543; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1544; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1545; GFX8-NEXT: s_nop 1 1546; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1547; GFX8-NEXT: s_nop 1 1548; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1549; GFX8-NEXT: s_nop 1 1550; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1551; GFX8-NEXT: s_nop 1 1552; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1553; GFX8-NEXT: s_nop 1 1554; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1555; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1556; GFX8-NEXT: s_nop 0 1557; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1558; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1559; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1560; GFX8-NEXT: ; implicit-def: $vgpr0 1561; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1562; GFX8-NEXT: s_cbranch_execz BB9_2 1563; GFX8-NEXT: ; %bb.1: 1564; GFX8-NEXT: v_mov_b32_e32 v0, 0 1565; 
GFX8-NEXT: v_mov_b32_e32 v3, s4 1566; GFX8-NEXT: s_mov_b32 m0, -1 1567; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1568; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1569; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1570; GFX8-NEXT: BB9_2: 1571; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1572; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1573; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1574; GFX8-NEXT: v_mov_b32_e32 v0, v1 1575; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1576; GFX8-NEXT: s_mov_b32 s3, 0xf000 1577; GFX8-NEXT: s_mov_b32 s2, -1 1578; GFX8-NEXT: s_nop 0 1579; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1580; GFX8-NEXT: s_endpgm 1581; 1582; GFX9-LABEL: sub_i32_varying: 1583; GFX9: ; %bb.0: ; %entry 1584; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1585; GFX9-NEXT: v_mov_b32_e32 v2, v0 1586; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1587; GFX9-NEXT: v_mov_b32_e32 v1, 0 1588; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1589; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1590; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1591; GFX9-NEXT: s_not_b64 exec, exec 1592; GFX9-NEXT: v_mov_b32_e32 v2, 0 1593; GFX9-NEXT: s_not_b64 exec, exec 1594; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1595; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1596; GFX9-NEXT: s_nop 1 1597; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1598; GFX9-NEXT: s_nop 1 1599; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1600; GFX9-NEXT: s_nop 1 1601; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1602; GFX9-NEXT: s_nop 1 1603; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1604; GFX9-NEXT: s_nop 1 1605; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1606; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1607; GFX9-NEXT: s_nop 0 1608; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1609; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1610; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1611; GFX9-NEXT: ; implicit-def: $vgpr0 1612; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1613; GFX9-NEXT: s_cbranch_execz BB9_2 1614; GFX9-NEXT: ; %bb.1: 1615; GFX9-NEXT: v_mov_b32_e32 v0, 0 1616; GFX9-NEXT: v_mov_b32_e32 v3, s4 1617; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1618; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1619; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1620; GFX9-NEXT: BB9_2: 1621; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1622; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1623; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1624; GFX9-NEXT: v_mov_b32_e32 v0, v1 1625; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1626; GFX9-NEXT: s_mov_b32 s3, 0xf000 1627; GFX9-NEXT: s_mov_b32 s2, -1 1628; GFX9-NEXT: s_nop 0 1629; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1630; GFX9-NEXT: s_endpgm 1631; 1632; GFX1064-LABEL: sub_i32_varying: 1633; GFX1064: ; %bb.0: ; %entry 1634; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1635; GFX1064-NEXT: s_not_b64 exec, exec 1636; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1637; GFX1064-NEXT: s_not_b64 exec, exec 1638; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1639; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1640; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1641; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1642; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1643; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1644; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1645; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1646; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1647; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1648; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1649; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1650; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1651; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf 1652; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1653; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1654; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1655; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1656; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 1657; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1658; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1659; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1660; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 1661; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 1662; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 1663; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1664; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1665; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1666; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 1667; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1668; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1669; GFX1064-NEXT: s_mov_b32 s2, -1 1670; GFX1064-NEXT: ; implicit-def: $vgpr0 1671; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1672; GFX1064-NEXT: s_cbranch_execz BB9_2 1673; GFX1064-NEXT: ; %bb.1: 1674; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1675; GFX1064-NEXT: v_mov_b32_e32 v4, s7 1676; GFX1064-NEXT: s_mov_b32 s3, s7 1677; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1678; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1679; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 1680; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1681; GFX1064-NEXT: buffer_gl0_inv 1682; GFX1064-NEXT: BB9_2: 1683; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1684; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1685; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1686; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1687; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1688; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1689; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1690; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1691; GFX1064-NEXT: s_endpgm 1692; 1693; GFX1032-LABEL: sub_i32_varying: 1694; GFX1032: ; %bb.0: ; %entry 1695; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1696; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1697; GFX1032-NEXT: v_mov_b32_e32 v1, 0 
1698; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1699; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1700; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1701; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1702; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1703; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1704; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1705; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1706; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1707; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1708; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1709; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1710; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1711; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1712; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1713; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1714; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1715; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1716; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1717; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1718; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1719; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1720; GFX1032-NEXT: s_mov_b32 s2, -1 1721; GFX1032-NEXT: ; implicit-def: $vgpr0 1722; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1723; GFX1032-NEXT: s_cbranch_execz BB9_2 1724; GFX1032-NEXT: ; %bb.1: 1725; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1726; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1727; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1728; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1729; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 1730; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1731; GFX1032-NEXT: buffer_gl0_inv 1732; GFX1032-NEXT: BB9_2: 1733; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1734; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1735; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1736; GFX1032-NEXT: 
v_mov_b32_e32 v0, v3 1737; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1738; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1739; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1741; GFX1032-NEXT: s_endpgm 1742entry: 1743 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1744 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1745 store i32 %old, i32 addrspace(1)* %out 1746 ret void 1747} 1748 1749define amdgpu_kernel void @sub_i32_varying_nouse() { 1750; GFX7LESS-LABEL: sub_i32_varying_nouse: 1751; GFX7LESS: ; %bb.0: ; %entry 1752; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1753; GFX7LESS-NEXT: s_mov_b32 m0, -1 1754; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX7LESS-NEXT: ds_sub_u32 v1, v0 1756; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1757; GFX7LESS-NEXT: s_endpgm 1758; 1759; GFX8-LABEL: sub_i32_varying_nouse: 1760; GFX8: ; %bb.0: ; %entry 1761; GFX8-NEXT: v_mov_b32_e32 v1, v0 1762; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1763; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1764; GFX8-NEXT: s_not_b64 exec, exec 1765; GFX8-NEXT: v_mov_b32_e32 v1, 0 1766; GFX8-NEXT: s_not_b64 exec, exec 1767; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 1768; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1769; GFX8-NEXT: s_nop 1 1770; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1771; GFX8-NEXT: s_nop 1 1772; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1773; GFX8-NEXT: s_nop 1 1774; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1775; GFX8-NEXT: s_nop 1 1776; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1777; GFX8-NEXT: s_nop 1 1778; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1779; GFX8-NEXT: v_readlane_b32 s2, v1, 63 1780; GFX8-NEXT: s_mov_b64 exec, s[0:1] 1781; GFX8-NEXT: s_mov_b32 s0, s2 
1782; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1783; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1784; GFX8-NEXT: s_cbranch_execz BB10_2 1785; GFX8-NEXT: ; %bb.1: 1786; GFX8-NEXT: v_mov_b32_e32 v0, 0 1787; GFX8-NEXT: v_mov_b32_e32 v2, s0 1788; GFX8-NEXT: s_mov_b32 m0, -1 1789; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1790; GFX8-NEXT: ds_sub_u32 v0, v2 1791; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1792; GFX8-NEXT: BB10_2: 1793; GFX8-NEXT: s_endpgm 1794; 1795; GFX9-LABEL: sub_i32_varying_nouse: 1796; GFX9: ; %bb.0: ; %entry 1797; GFX9-NEXT: v_mov_b32_e32 v1, v0 1798; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1799; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1800; GFX9-NEXT: s_not_b64 exec, exec 1801; GFX9-NEXT: v_mov_b32_e32 v1, 0 1802; GFX9-NEXT: s_not_b64 exec, exec 1803; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 1804; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1805; GFX9-NEXT: s_nop 1 1806; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1807; GFX9-NEXT: s_nop 1 1808; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1809; GFX9-NEXT: s_nop 1 1810; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1811; GFX9-NEXT: s_nop 1 1812; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1813; GFX9-NEXT: s_nop 1 1814; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1815; GFX9-NEXT: v_readlane_b32 s2, v1, 63 1816; GFX9-NEXT: s_mov_b64 exec, s[0:1] 1817; GFX9-NEXT: s_mov_b32 s0, s2 1818; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1819; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1820; GFX9-NEXT: s_cbranch_execz BB10_2 1821; GFX9-NEXT: ; %bb.1: 1822; GFX9-NEXT: v_mov_b32_e32 v0, 0 1823; GFX9-NEXT: v_mov_b32_e32 v2, s0 1824; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1825; GFX9-NEXT: ds_sub_u32 v0, v2 1826; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1827; GFX9-NEXT: BB10_2: 1828; GFX9-NEXT: s_endpgm 1829; 1830; 
GFX1064-LABEL: sub_i32_varying_nouse: 1831; GFX1064: ; %bb.0: ; %entry 1832; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1833; GFX1064-NEXT: s_not_b64 exec, exec 1834; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1835; GFX1064-NEXT: s_not_b64 exec, exec 1836; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1837; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1838; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1839; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1840; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1841; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1842; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1843; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 1844; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1845; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1846; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1847; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 1848; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 1849; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1850; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1851; GFX1064-NEXT: s_add_i32 s0, s2, s3 1852; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1853; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1854; GFX1064-NEXT: s_cbranch_execz BB10_2 1855; GFX1064-NEXT: ; %bb.1: 1856; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1857; GFX1064-NEXT: v_mov_b32_e32 v3, s0 1858; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1859; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1860; GFX1064-NEXT: ds_sub_u32 v0, v3 1861; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1862; GFX1064-NEXT: buffer_gl0_inv 1863; GFX1064-NEXT: BB10_2: 1864; GFX1064-NEXT: s_endpgm 1865; 1866; GFX1032-LABEL: sub_i32_varying_nouse: 1867; GFX1032: ; %bb.0: ; %entry 1868; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1869; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1870; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1871; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1872; GFX1032-NEXT: 
s_or_saveexec_b32 s0, -1 1873; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1874; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1875; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1876; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1877; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1878; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1879; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 1880; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1881; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1882; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1883; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 1884; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1885; GFX1032-NEXT: s_cbranch_execz BB10_2 1886; GFX1032-NEXT: ; %bb.1: 1887; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1888; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1889; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1890; GFX1032-NEXT: ds_sub_u32 v3, v0 1891; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1892; GFX1032-NEXT: buffer_gl0_inv 1893; GFX1032-NEXT: BB10_2: 1894; GFX1032-NEXT: s_endpgm 1895entry: 1896 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1897 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1898 ret void 1899} 1900 1901define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 1902; 1903; 1904; GFX7LESS-LABEL: sub_i64_constant: 1905; GFX7LESS: ; %bb.0: ; %entry 1906; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1907; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1908; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1909; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1910; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1911; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1912; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1913; GFX7LESS-NEXT: s_cbranch_execz BB11_2 1914; GFX7LESS-NEXT: ; %bb.1: 1915; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1916; 
GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1917; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 1918; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 1919; GFX7LESS-NEXT: s_mov_b32 m0, -1 1920; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1921; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] 1922; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1923; GFX7LESS-NEXT: BB11_2: 1924; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1925; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1926; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1927; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1928; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1929; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1930; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1931; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1932; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1933; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1934; GFX7LESS-NEXT: s_mov_b32 s2, -1 1935; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1936; GFX7LESS-NEXT: s_endpgm 1937; 1938; GFX8-LABEL: sub_i64_constant: 1939; GFX8: ; %bb.0: ; %entry 1940; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1941; GFX8-NEXT: s_mov_b64 s[4:5], exec 1942; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1943; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1944; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1945; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1946; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1947; GFX8-NEXT: s_cbranch_execz BB11_2 1948; GFX8-NEXT: ; %bb.1: 1949; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1950; GFX8-NEXT: s_mul_i32 s4, s4, 5 1951; GFX8-NEXT: v_mov_b32_e32 v1, s4 1952; GFX8-NEXT: v_mov_b32_e32 v2, 0 1953; GFX8-NEXT: s_mov_b32 m0, -1 1954; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1955; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] 1956; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1957; GFX8-NEXT: BB11_2: 1958; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1959; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1960; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1961; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1962; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1963; GFX8-NEXT: v_mul_u32_u24_e32 v0, 
5, v0 1964; GFX8-NEXT: v_mov_b32_e32 v2, s3 1965; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1966; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1967; GFX8-NEXT: s_mov_b32 s3, 0xf000 1968; GFX8-NEXT: s_mov_b32 s2, -1 1969; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1970; GFX8-NEXT: s_endpgm 1971; 1972; GFX9-LABEL: sub_i64_constant: 1973; GFX9: ; %bb.0: ; %entry 1974; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1975; GFX9-NEXT: s_mov_b64 s[4:5], exec 1976; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1977; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1978; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1979; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1980; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1981; GFX9-NEXT: s_cbranch_execz BB11_2 1982; GFX9-NEXT: ; %bb.1: 1983; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1984; GFX9-NEXT: s_mul_i32 s4, s4, 5 1985; GFX9-NEXT: v_mov_b32_e32 v1, s4 1986; GFX9-NEXT: v_mov_b32_e32 v2, 0 1987; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1988; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] 1989; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1990; GFX9-NEXT: BB11_2: 1991; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1992; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1993; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1994; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1995; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1996; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1997; GFX9-NEXT: v_mov_b32_e32 v2, s3 1998; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 1999; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2000; GFX9-NEXT: s_mov_b32 s3, 0xf000 2001; GFX9-NEXT: s_mov_b32 s2, -1 2002; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2003; GFX9-NEXT: s_endpgm 2004; 2005; GFX1064-LABEL: sub_i64_constant: 2006; GFX1064: ; %bb.0: ; %entry 2007; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2008; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2009; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2010; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2011; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2012; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 2013; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2014; GFX1064-NEXT: s_cbranch_execz BB11_2 2015; GFX1064-NEXT: ; %bb.1: 2016; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2017; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2018; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2019; GFX1064-NEXT: v_mov_b32_e32 v1, s4 2020; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2021; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2022; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] 2023; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2024; GFX1064-NEXT: buffer_gl0_inv 2025; GFX1064-NEXT: BB11_2: 2026; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2027; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2028; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2029; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2030; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2031; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2032; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1 2033; GFX1064-NEXT: s_mov_b32 s2, -1 2034; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2035; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2036; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2037; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2038; GFX1064-NEXT: s_endpgm 2039; 2040; GFX1032-LABEL: sub_i64_constant: 2041; GFX1032: ; %bb.0: ; %entry 2042; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2043; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2044; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2045; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 2046; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2047; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2048; GFX1032-NEXT: s_cbranch_execz BB11_2 2049; GFX1032-NEXT: ; %bb.1: 2050; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2051; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2052; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2053; GFX1032-NEXT: v_mov_b32_e32 v1, s3 2054; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2055; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2056; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] 2057; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2058; 
GFX1032-NEXT: buffer_gl0_inv 2059; GFX1032-NEXT: BB11_2: 2060; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2061; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2062; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2063; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2064; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2065; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2066; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 2067; GFX1032-NEXT: s_mov_b32 s2, -1 2068; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2069; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2070; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2071; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2072; GFX1032-NEXT: s_endpgm 2073entry: 2074 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2075 store i64 %old, i64 addrspace(1)* %out 2076 ret void 2077} 2078 2079define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2080; 2081; 2082; GFX7LESS-LABEL: sub_i64_uniform: 2083; GFX7LESS: ; %bb.0: ; %entry 2084; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2085; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2086; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2087; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2088; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2089; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2090; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2091; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2092; GFX7LESS-NEXT: ; %bb.1: 2093; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2094; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 2095; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2096; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2097; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2098; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2099; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2100; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2101; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2102; GFX7LESS-NEXT: s_mov_b32 m0, -1 2103; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2104; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2105; GFX7LESS-NEXT: s_waitcnt 
lgkmcnt(0) 2106; GFX7LESS-NEXT: BB12_2: 2107; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2108; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2109; GFX7LESS-NEXT: s_mov_b32 s6, -1 2110; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2111; GFX7LESS-NEXT: s_mov_b32 s4, s0 2112; GFX7LESS-NEXT: s_mov_b32 s5, s1 2113; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2114; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2115; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2116; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2117; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2118; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2119; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2120; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2121; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2122; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2123; GFX7LESS-NEXT: s_endpgm 2124; 2125; GFX8-LABEL: sub_i64_uniform: 2126; GFX8: ; %bb.0: ; %entry 2127; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2128; GFX8-NEXT: s_mov_b64 s[6:7], exec 2129; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2130; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2131; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2132; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2133; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2134; GFX8-NEXT: s_cbranch_execz BB12_2 2135; GFX8-NEXT: ; %bb.1: 2136; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2137; GFX8-NEXT: v_mov_b32_e32 v1, s6 2138; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2139; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2140; GFX8-NEXT: s_mul_i32 s7, s3, s6 2141; GFX8-NEXT: s_mul_i32 s6, s2, s6 2142; GFX8-NEXT: v_mov_b32_e32 v3, 0 2143; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2144; GFX8-NEXT: v_mov_b32_e32 v1, s6 2145; GFX8-NEXT: s_mov_b32 m0, -1 2146; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2147; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2148; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2149; GFX8-NEXT: BB12_2: 2150; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2151; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2152; GFX8-NEXT: s_mov_b32 s4, s0 2153; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2154; GFX8-NEXT: 
v_mul_lo_u32 v1, s3, v0 2155; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2156; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2157; GFX8-NEXT: s_mov_b32 s5, s1 2158; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2159; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2160; GFX8-NEXT: v_mov_b32_e32 v2, s1 2161; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2162; GFX8-NEXT: s_mov_b32 s7, 0xf000 2163; GFX8-NEXT: s_mov_b32 s6, -1 2164; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2165; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2166; GFX8-NEXT: s_endpgm 2167; 2168; GFX9-LABEL: sub_i64_uniform: 2169; GFX9: ; %bb.0: ; %entry 2170; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2171; GFX9-NEXT: s_mov_b64 s[6:7], exec 2172; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2173; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2174; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2175; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2176; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2177; GFX9-NEXT: s_cbranch_execz BB12_2 2178; GFX9-NEXT: ; %bb.1: 2179; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2180; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2181; GFX9-NEXT: s_mul_i32 s7, s3, s6 2182; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2183; GFX9-NEXT: s_add_i32 s8, s8, s7 2184; GFX9-NEXT: s_mul_i32 s6, s2, s6 2185; GFX9-NEXT: v_mov_b32_e32 v1, s6 2186; GFX9-NEXT: v_mov_b32_e32 v2, s8 2187; GFX9-NEXT: v_mov_b32_e32 v3, 0 2188; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2189; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2190; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2191; GFX9-NEXT: BB12_2: 2192; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2193; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2194; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2195; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2196; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2197; GFX9-NEXT: s_mov_b32 s4, s0 2198; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2199; GFX9-NEXT: s_mov_b32 s5, s1 2200; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2201; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2202; GFX9-NEXT: v_mov_b32_e32 v2, s1 2203; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2204; 
GFX9-NEXT: s_mov_b32 s7, 0xf000 2205; GFX9-NEXT: s_mov_b32 s6, -1 2206; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2207; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2208; GFX9-NEXT: s_endpgm 2209; 2210; GFX1064-LABEL: sub_i64_uniform: 2211; GFX1064: ; %bb.0: ; %entry 2212; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2213; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2214; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2215; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2216; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2217; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2218; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2219; GFX1064-NEXT: s_cbranch_execz BB12_2 2220; GFX1064-NEXT: ; %bb.1: 2221; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2222; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2223; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2224; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2225; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2226; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2227; GFX1064-NEXT: s_add_i32 s8, s8, s7 2228; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2229; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2230; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2231; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2232; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2233; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2234; GFX1064-NEXT: buffer_gl0_inv 2235; GFX1064-NEXT: BB12_2: 2236; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2237; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2238; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2239; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2240; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2241; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2242; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2243; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2244; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2245; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2246; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2247; GFX1064-NEXT: s_mov_b32 s2, -1 2248; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2249; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
2250; GFX1064-NEXT: s_endpgm 2251; 2252; GFX1032-LABEL: sub_i64_uniform: 2253; GFX1032: ; %bb.0: ; %entry 2254; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2255; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2256; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2257; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 2258; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2259; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2260; GFX1032-NEXT: s_cbranch_execz BB12_2 2261; GFX1032-NEXT: ; %bb.1: 2262; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2263; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2264; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2265; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2266; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2267; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2268; GFX1032-NEXT: s_add_i32 s7, s7, s6 2269; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2270; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2271; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2272; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2273; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2274; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2275; GFX1032-NEXT: buffer_gl0_inv 2276; GFX1032-NEXT: BB12_2: 2277; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2278; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2279; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2280; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2281; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2282; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2283; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2284; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2285; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2286; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2287; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2288; GFX1032-NEXT: s_mov_b32 s2, -1 2289; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2290; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2291; GFX1032-NEXT: s_endpgm 2292entry: 2293 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2294 store i64 %old, i64 addrspace(1)* %out 2295 ret void 2296} 2297 2298define amdgpu_kernel void 
@sub_i64_varying(i64 addrspace(1)* %out) { 2299; 2300; 2301; GFX7LESS-LABEL: sub_i64_varying: 2302; GFX7LESS: ; %bb.0: ; %entry 2303; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2304; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2305; GFX7LESS-NEXT: s_mov_b32 m0, -1 2306; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2307; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2308; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2309; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2310; GFX7LESS-NEXT: s_mov_b32 s2, -1 2311; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2312; GFX7LESS-NEXT: s_endpgm 2313; 2314; GFX8-LABEL: sub_i64_varying: 2315; GFX8: ; %bb.0: ; %entry 2316; GFX8-NEXT: v_mov_b32_e32 v1, 0 2317; GFX8-NEXT: s_mov_b32 m0, -1 2318; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2319; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2320; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2321; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2322; GFX8-NEXT: s_mov_b32 s3, 0xf000 2323; GFX8-NEXT: s_mov_b32 s2, -1 2324; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2325; GFX8-NEXT: s_endpgm 2326; 2327; GFX9-LABEL: sub_i64_varying: 2328; GFX9: ; %bb.0: ; %entry 2329; GFX9-NEXT: v_mov_b32_e32 v1, 0 2330; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2331; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2332; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2333; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2334; GFX9-NEXT: s_mov_b32 s3, 0xf000 2335; GFX9-NEXT: s_mov_b32 s2, -1 2336; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2337; GFX9-NEXT: s_endpgm 2338; 2339; GFX10-LABEL: sub_i64_varying: 2340; GFX10: ; %bb.0: ; %entry 2341; GFX10-NEXT: v_mov_b32_e32 v1, 0 2342; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2343; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2344; GFX10-NEXT: s_mov_b32 s2, -1 2345; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2346; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2347; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2348; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2349; GFX10-NEXT: buffer_gl0_inv 2350; GFX10-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2351; GFX10-NEXT: s_endpgm 2352entry: 2353 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2354 %zext = zext i32 %lane to i64 2355 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2356 store i64 %old, i64 addrspace(1)* %out 2357 ret void 2358} 2359 2360define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2361; 2362; 2363; GFX7LESS-LABEL: and_i32_varying: 2364; GFX7LESS: ; %bb.0: ; %entry 2365; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2366; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2367; GFX7LESS-NEXT: s_mov_b32 m0, -1 2368; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2369; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2370; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2371; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2372; GFX7LESS-NEXT: s_mov_b32 s2, -1 2373; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2374; GFX7LESS-NEXT: s_endpgm 2375; 2376; GFX8-LABEL: and_i32_varying: 2377; GFX8: ; %bb.0: ; %entry 2378; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2379; GFX8-NEXT: v_mov_b32_e32 v2, v0 2380; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2381; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2382; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2383; GFX8-NEXT: v_mov_b32_e32 v1, -1 2384; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2385; GFX8-NEXT: s_not_b64 exec, exec 2386; GFX8-NEXT: v_mov_b32_e32 v2, -1 2387; GFX8-NEXT: s_not_b64 exec, exec 2388; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2389; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2390; GFX8-NEXT: s_nop 1 2391; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2392; GFX8-NEXT: s_nop 1 2393; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2394; GFX8-NEXT: s_nop 1 2395; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2396; GFX8-NEXT: s_nop 1 2397; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2398; GFX8-NEXT: s_nop 1 2399; GFX8-NEXT: 
v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2400; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2401; GFX8-NEXT: s_nop 0 2402; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2403; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2404; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2405; GFX8-NEXT: ; implicit-def: $vgpr0 2406; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2407; GFX8-NEXT: s_cbranch_execz BB14_2 2408; GFX8-NEXT: ; %bb.1: 2409; GFX8-NEXT: v_mov_b32_e32 v0, 0 2410; GFX8-NEXT: v_mov_b32_e32 v3, s4 2411; GFX8-NEXT: s_mov_b32 m0, -1 2412; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2413; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2414; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2415; GFX8-NEXT: BB14_2: 2416; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2417; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2418; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2419; GFX8-NEXT: v_mov_b32_e32 v0, v1 2420; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2421; GFX8-NEXT: s_mov_b32 s3, 0xf000 2422; GFX8-NEXT: s_mov_b32 s2, -1 2423; GFX8-NEXT: s_nop 0 2424; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2425; GFX8-NEXT: s_endpgm 2426; 2427; GFX9-LABEL: and_i32_varying: 2428; GFX9: ; %bb.0: ; %entry 2429; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2430; GFX9-NEXT: v_mov_b32_e32 v2, v0 2431; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2432; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2433; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2434; GFX9-NEXT: v_mov_b32_e32 v1, -1 2435; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2436; GFX9-NEXT: s_not_b64 exec, exec 2437; GFX9-NEXT: v_mov_b32_e32 v2, -1 2438; GFX9-NEXT: s_not_b64 exec, exec 2439; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2440; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2441; GFX9-NEXT: s_nop 1 2442; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2443; GFX9-NEXT: s_nop 1 2444; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2445; GFX9-NEXT: s_nop 1 2446; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 
row_mask:0xf bank_mask:0xf 2447; GFX9-NEXT: s_nop 1 2448; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2449; GFX9-NEXT: s_nop 1 2450; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2451; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2452; GFX9-NEXT: s_nop 0 2453; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2454; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2455; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2456; GFX9-NEXT: ; implicit-def: $vgpr0 2457; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2458; GFX9-NEXT: s_cbranch_execz BB14_2 2459; GFX9-NEXT: ; %bb.1: 2460; GFX9-NEXT: v_mov_b32_e32 v0, 0 2461; GFX9-NEXT: v_mov_b32_e32 v3, s4 2462; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2463; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2464; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2465; GFX9-NEXT: BB14_2: 2466; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2467; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2468; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2469; GFX9-NEXT: v_mov_b32_e32 v0, v1 2470; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2471; GFX9-NEXT: s_mov_b32 s3, 0xf000 2472; GFX9-NEXT: s_mov_b32 s2, -1 2473; GFX9-NEXT: s_nop 0 2474; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2475; GFX9-NEXT: s_endpgm 2476; 2477; GFX1064-LABEL: and_i32_varying: 2478; GFX1064: ; %bb.0: ; %entry 2479; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2480; GFX1064-NEXT: s_not_b64 exec, exec 2481; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2482; GFX1064-NEXT: s_not_b64 exec, exec 2483; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2484; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2485; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2486; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2487; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2488; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2489; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2490; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2491; 
GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2492; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2493; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2494; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2495; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2496; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2497; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2498; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2499; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2500; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2501; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2502; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2503; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2504; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2505; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2506; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2507; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2508; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2509; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2510; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2511; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2512; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2513; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2514; GFX1064-NEXT: s_mov_b32 s2, -1 2515; GFX1064-NEXT: ; implicit-def: $vgpr0 2516; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2517; GFX1064-NEXT: s_cbranch_execz BB14_2 2518; GFX1064-NEXT: ; %bb.1: 2519; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2520; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2521; GFX1064-NEXT: s_mov_b32 s3, s7 2522; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2523; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2524; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 2525; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2526; GFX1064-NEXT: buffer_gl0_inv 2527; GFX1064-NEXT: BB14_2: 2528; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2529; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2530; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2531; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2532; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2533; 
GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2534; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2535; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2536; GFX1064-NEXT: s_endpgm 2537; 2538; GFX1032-LABEL: and_i32_varying: 2539; GFX1032: ; %bb.0: ; %entry 2540; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2541; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2542; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2543; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2544; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2545; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2546; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2547; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2548; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2549; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2550; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2551; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2552; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2553; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2554; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2555; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2556; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2557; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2558; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2559; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2560; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2561; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2562; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2563; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2564; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2565; GFX1032-NEXT: s_mov_b32 s2, -1 2566; GFX1032-NEXT: ; implicit-def: $vgpr0 2567; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2568; GFX1032-NEXT: s_cbranch_execz BB14_2 2569; GFX1032-NEXT: ; %bb.1: 2570; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2571; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2572; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2573; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2574; 
GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 2575; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2576; GFX1032-NEXT: buffer_gl0_inv 2577; GFX1032-NEXT: BB14_2: 2578; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2579; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2580; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2581; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2582; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2583; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2584; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2585; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2586; GFX1032-NEXT: s_endpgm 2587entry: 2588 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2589 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2590 store i32 %old, i32 addrspace(1)* %out 2591 ret void 2592} 2593 2594define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2595; 2596; 2597; GFX7LESS-LABEL: or_i32_varying: 2598; GFX7LESS: ; %bb.0: ; %entry 2599; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2600; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2601; GFX7LESS-NEXT: s_mov_b32 m0, -1 2602; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2603; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2604; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2605; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2606; GFX7LESS-NEXT: s_mov_b32 s2, -1 2607; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2608; GFX7LESS-NEXT: s_endpgm 2609; 2610; GFX8-LABEL: or_i32_varying: 2611; GFX8: ; %bb.0: ; %entry 2612; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2613; GFX8-NEXT: v_mov_b32_e32 v2, v0 2614; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2615; GFX8-NEXT: v_mov_b32_e32 v1, 0 2616; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2617; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2618; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2619; GFX8-NEXT: s_not_b64 exec, exec 2620; GFX8-NEXT: v_mov_b32_e32 v2, 0 2621; GFX8-NEXT: s_not_b64 exec, exec 2622; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2623; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2624; GFX8-NEXT: 
s_nop 1 2625; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2626; GFX8-NEXT: s_nop 1 2627; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2628; GFX8-NEXT: s_nop 1 2629; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2630; GFX8-NEXT: s_nop 1 2631; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2632; GFX8-NEXT: s_nop 1 2633; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2634; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2635; GFX8-NEXT: s_nop 0 2636; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2637; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2638; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2639; GFX8-NEXT: ; implicit-def: $vgpr0 2640; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2641; GFX8-NEXT: s_cbranch_execz BB15_2 2642; GFX8-NEXT: ; %bb.1: 2643; GFX8-NEXT: v_mov_b32_e32 v0, 0 2644; GFX8-NEXT: v_mov_b32_e32 v3, s4 2645; GFX8-NEXT: s_mov_b32 m0, -1 2646; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2647; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2648; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2649; GFX8-NEXT: BB15_2: 2650; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2651; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2652; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2653; GFX8-NEXT: v_mov_b32_e32 v0, v1 2654; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2655; GFX8-NEXT: s_mov_b32 s3, 0xf000 2656; GFX8-NEXT: s_mov_b32 s2, -1 2657; GFX8-NEXT: s_nop 0 2658; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2659; GFX8-NEXT: s_endpgm 2660; 2661; GFX9-LABEL: or_i32_varying: 2662; GFX9: ; %bb.0: ; %entry 2663; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2664; GFX9-NEXT: v_mov_b32_e32 v2, v0 2665; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2666; GFX9-NEXT: v_mov_b32_e32 v1, 0 2667; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2668; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2669; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2670; GFX9-NEXT: s_not_b64 exec, exec 2671; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 2672; GFX9-NEXT: s_not_b64 exec, exec 2673; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2674; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2675; GFX9-NEXT: s_nop 1 2676; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2677; GFX9-NEXT: s_nop 1 2678; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2679; GFX9-NEXT: s_nop 1 2680; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2681; GFX9-NEXT: s_nop 1 2682; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2683; GFX9-NEXT: s_nop 1 2684; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2685; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2686; GFX9-NEXT: s_nop 0 2687; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2688; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2689; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2690; GFX9-NEXT: ; implicit-def: $vgpr0 2691; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2692; GFX9-NEXT: s_cbranch_execz BB15_2 2693; GFX9-NEXT: ; %bb.1: 2694; GFX9-NEXT: v_mov_b32_e32 v0, 0 2695; GFX9-NEXT: v_mov_b32_e32 v3, s4 2696; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2697; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2698; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2699; GFX9-NEXT: BB15_2: 2700; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2701; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2702; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2703; GFX9-NEXT: v_mov_b32_e32 v0, v1 2704; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2705; GFX9-NEXT: s_mov_b32 s3, 0xf000 2706; GFX9-NEXT: s_mov_b32 s2, -1 2707; GFX9-NEXT: s_nop 0 2708; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2709; GFX9-NEXT: s_endpgm 2710; 2711; GFX1064-LABEL: or_i32_varying: 2712; GFX1064: ; %bb.0: ; %entry 2713; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2714; GFX1064-NEXT: s_not_b64 exec, exec 2715; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2716; GFX1064-NEXT: s_not_b64 exec, exec 2717; 
GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2718; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2719; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2720; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2721; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2722; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2723; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2724; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2725; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2726; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2727; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2728; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2729; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2730; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2731; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2732; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2733; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2734; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2735; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2736; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2737; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2738; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2739; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2740; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2741; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2742; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2743; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2744; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2745; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2746; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2747; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2748; GFX1064-NEXT: s_mov_b32 s2, -1 2749; GFX1064-NEXT: ; implicit-def: $vgpr0 2750; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2751; GFX1064-NEXT: s_cbranch_execz BB15_2 2752; GFX1064-NEXT: ; %bb.1: 2753; GFX1064-NEXT: v_mov_b32_e32 v0, 0 
2754; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2755; GFX1064-NEXT: s_mov_b32 s3, s7 2756; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2757; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2758; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 2759; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2760; GFX1064-NEXT: buffer_gl0_inv 2761; GFX1064-NEXT: BB15_2: 2762; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2763; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2764; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2765; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2766; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 2767; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2768; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2769; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2770; GFX1064-NEXT: s_endpgm 2771; 2772; GFX1032-LABEL: or_i32_varying: 2773; GFX1032: ; %bb.0: ; %entry 2774; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2775; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2776; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2777; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2778; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2779; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2780; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2781; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2782; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2783; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2784; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2785; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2786; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2787; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2788; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2789; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2790; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2791; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2792; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2793; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2794; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2795; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2796; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2797; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2798; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2799; GFX1032-NEXT: s_mov_b32 s2, -1 2800; GFX1032-NEXT: ; implicit-def: $vgpr0 2801; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2802; GFX1032-NEXT: s_cbranch_execz BB15_2 2803; GFX1032-NEXT: ; %bb.1: 2804; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2805; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2806; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2807; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2808; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 2809; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2810; GFX1032-NEXT: buffer_gl0_inv 2811; GFX1032-NEXT: BB15_2: 2812; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2813; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2814; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2815; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2816; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 2817; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2818; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2819; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2820; GFX1032-NEXT: s_endpgm 2821entry: 2822 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2823 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2824 store i32 %old, i32 addrspace(1)* %out 2825 ret void 2826} 2827 2828define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 2829; 2830; 2831; GFX7LESS-LABEL: xor_i32_varying: 2832; GFX7LESS: ; %bb.0: ; %entry 2833; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2834; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2835; GFX7LESS-NEXT: s_mov_b32 m0, -1 2836; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2837; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 2838; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2839; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2840; GFX7LESS-NEXT: s_mov_b32 s2, -1 2841; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2842; GFX7LESS-NEXT: s_endpgm 2843; 2844; GFX8-LABEL: xor_i32_varying: 2845; GFX8: ; 
%bb.0: ; %entry 2846; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2847; GFX8-NEXT: v_mov_b32_e32 v2, v0 2848; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2849; GFX8-NEXT: v_mov_b32_e32 v1, 0 2850; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2851; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2852; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2853; GFX8-NEXT: s_not_b64 exec, exec 2854; GFX8-NEXT: v_mov_b32_e32 v2, 0 2855; GFX8-NEXT: s_not_b64 exec, exec 2856; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2857; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2858; GFX8-NEXT: s_nop 1 2859; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2860; GFX8-NEXT: s_nop 1 2861; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2862; GFX8-NEXT: s_nop 1 2863; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2864; GFX8-NEXT: s_nop 1 2865; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2866; GFX8-NEXT: s_nop 1 2867; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2868; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2869; GFX8-NEXT: s_nop 0 2870; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2871; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2872; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2873; GFX8-NEXT: ; implicit-def: $vgpr0 2874; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2875; GFX8-NEXT: s_cbranch_execz BB16_2 2876; GFX8-NEXT: ; %bb.1: 2877; GFX8-NEXT: v_mov_b32_e32 v0, 0 2878; GFX8-NEXT: v_mov_b32_e32 v3, s4 2879; GFX8-NEXT: s_mov_b32 m0, -1 2880; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2881; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 2882; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2883; GFX8-NEXT: BB16_2: 2884; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2885; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2886; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2887; GFX8-NEXT: v_mov_b32_e32 v0, v1 2888; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 2889; 
GFX8-NEXT: s_mov_b32 s3, 0xf000 2890; GFX8-NEXT: s_mov_b32 s2, -1 2891; GFX8-NEXT: s_nop 0 2892; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2893; GFX8-NEXT: s_endpgm 2894; 2895; GFX9-LABEL: xor_i32_varying: 2896; GFX9: ; %bb.0: ; %entry 2897; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2898; GFX9-NEXT: v_mov_b32_e32 v2, v0 2899; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2900; GFX9-NEXT: v_mov_b32_e32 v1, 0 2901; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2902; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2903; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2904; GFX9-NEXT: s_not_b64 exec, exec 2905; GFX9-NEXT: v_mov_b32_e32 v2, 0 2906; GFX9-NEXT: s_not_b64 exec, exec 2907; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2908; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2909; GFX9-NEXT: s_nop 1 2910; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2911; GFX9-NEXT: s_nop 1 2912; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2913; GFX9-NEXT: s_nop 1 2914; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2915; GFX9-NEXT: s_nop 1 2916; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2917; GFX9-NEXT: s_nop 1 2918; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2919; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2920; GFX9-NEXT: s_nop 0 2921; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2922; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2923; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2924; GFX9-NEXT: ; implicit-def: $vgpr0 2925; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2926; GFX9-NEXT: s_cbranch_execz BB16_2 2927; GFX9-NEXT: ; %bb.1: 2928; GFX9-NEXT: v_mov_b32_e32 v0, 0 2929; GFX9-NEXT: v_mov_b32_e32 v3, s4 2930; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2931; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 2932; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2933; GFX9-NEXT: BB16_2: 2934; GFX9-NEXT: 
s_or_b64 exec, exec, s[2:3] 2935; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2936; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2937; GFX9-NEXT: v_mov_b32_e32 v0, v1 2938; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2939; GFX9-NEXT: s_mov_b32 s3, 0xf000 2940; GFX9-NEXT: s_mov_b32 s2, -1 2941; GFX9-NEXT: s_nop 0 2942; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2943; GFX9-NEXT: s_endpgm 2944; 2945; GFX1064-LABEL: xor_i32_varying: 2946; GFX1064: ; %bb.0: ; %entry 2947; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2948; GFX1064-NEXT: s_not_b64 exec, exec 2949; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2950; GFX1064-NEXT: s_not_b64 exec, exec 2951; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2952; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2953; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2954; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2955; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2956; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2957; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2958; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2959; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2960; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2961; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2962; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2963; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2964; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2965; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2966; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2967; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2968; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2969; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2970; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2971; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2972; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2973; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 
2974; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2975; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2976; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2977; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2978; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2979; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2980; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2981; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2982; GFX1064-NEXT: s_mov_b32 s2, -1 2983; GFX1064-NEXT: ; implicit-def: $vgpr0 2984; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2985; GFX1064-NEXT: s_cbranch_execz BB16_2 2986; GFX1064-NEXT: ; %bb.1: 2987; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2988; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2989; GFX1064-NEXT: s_mov_b32 s3, s7 2990; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2991; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2992; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 2993; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2994; GFX1064-NEXT: buffer_gl0_inv 2995; GFX1064-NEXT: BB16_2: 2996; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2997; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2998; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2999; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3000; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3001; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3002; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3003; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3004; GFX1064-NEXT: s_endpgm 3005; 3006; GFX1032-LABEL: xor_i32_varying: 3007; GFX1032: ; %bb.0: ; %entry 3008; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3009; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3010; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3011; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3012; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3013; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3014; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3015; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3016; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 3017; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3018; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3019; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3020; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3021; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3022; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3023; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3024; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3025; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3026; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3027; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3028; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3029; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3030; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3031; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3032; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3033; GFX1032-NEXT: s_mov_b32 s2, -1 3034; GFX1032-NEXT: ; implicit-def: $vgpr0 3035; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3036; GFX1032-NEXT: s_cbranch_execz BB16_2 3037; GFX1032-NEXT: ; %bb.1: 3038; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3039; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3040; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3041; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3042; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 3043; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3044; GFX1032-NEXT: buffer_gl0_inv 3045; GFX1032-NEXT: BB16_2: 3046; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3047; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3048; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3049; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3050; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3051; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3052; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3053; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3054; GFX1032-NEXT: s_endpgm 3055entry: 3056 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3057 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3058 store i32 %old, i32 addrspace(1)* %out 3059 ret void 3060} 3061 3062define 
amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3063; 3064; 3065; GFX7LESS-LABEL: max_i32_varying: 3066; GFX7LESS: ; %bb.0: ; %entry 3067; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3068; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3069; GFX7LESS-NEXT: s_mov_b32 m0, -1 3070; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3071; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3072; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3073; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3074; GFX7LESS-NEXT: s_mov_b32 s2, -1 3075; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3076; GFX7LESS-NEXT: s_endpgm 3077; 3078; GFX8-LABEL: max_i32_varying: 3079; GFX8: ; %bb.0: ; %entry 3080; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3081; GFX8-NEXT: v_mov_b32_e32 v2, v0 3082; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3083; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3084; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3085; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3086; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3087; GFX8-NEXT: s_not_b64 exec, exec 3088; GFX8-NEXT: v_mov_b32_e32 v2, v1 3089; GFX8-NEXT: s_not_b64 exec, exec 3090; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3091; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3092; GFX8-NEXT: s_nop 1 3093; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3094; GFX8-NEXT: s_nop 1 3095; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3096; GFX8-NEXT: s_nop 1 3097; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3098; GFX8-NEXT: s_nop 1 3099; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3100; GFX8-NEXT: s_nop 1 3101; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3102; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3103; GFX8-NEXT: s_nop 0 3104; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3105; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3106; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3107; GFX8-NEXT: ; implicit-def: 
$vgpr0 3108; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3109; GFX8-NEXT: s_cbranch_execz BB17_2 3110; GFX8-NEXT: ; %bb.1: 3111; GFX8-NEXT: v_mov_b32_e32 v0, 0 3112; GFX8-NEXT: v_mov_b32_e32 v3, s4 3113; GFX8-NEXT: s_mov_b32 m0, -1 3114; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3115; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3116; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3117; GFX8-NEXT: BB17_2: 3118; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3119; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3120; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3121; GFX8-NEXT: v_mov_b32_e32 v0, v1 3122; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3123; GFX8-NEXT: s_mov_b32 s3, 0xf000 3124; GFX8-NEXT: s_mov_b32 s2, -1 3125; GFX8-NEXT: s_nop 0 3126; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3127; GFX8-NEXT: s_endpgm 3128; 3129; GFX9-LABEL: max_i32_varying: 3130; GFX9: ; %bb.0: ; %entry 3131; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3132; GFX9-NEXT: v_mov_b32_e32 v2, v0 3133; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3134; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3135; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3136; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3137; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3138; GFX9-NEXT: s_not_b64 exec, exec 3139; GFX9-NEXT: v_mov_b32_e32 v2, v1 3140; GFX9-NEXT: s_not_b64 exec, exec 3141; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3142; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3143; GFX9-NEXT: s_nop 1 3144; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3145; GFX9-NEXT: s_nop 1 3146; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3147; GFX9-NEXT: s_nop 1 3148; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3149; GFX9-NEXT: s_nop 1 3150; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3151; GFX9-NEXT: s_nop 1 3152; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3153; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3154; GFX9-NEXT: s_nop 0 3155; GFX9-NEXT: 
v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3156; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3157; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3158; GFX9-NEXT: ; implicit-def: $vgpr0 3159; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3160; GFX9-NEXT: s_cbranch_execz BB17_2 3161; GFX9-NEXT: ; %bb.1: 3162; GFX9-NEXT: v_mov_b32_e32 v0, 0 3163; GFX9-NEXT: v_mov_b32_e32 v3, s4 3164; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3165; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3166; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3167; GFX9-NEXT: BB17_2: 3168; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3169; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3170; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3171; GFX9-NEXT: v_mov_b32_e32 v0, v1 3172; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3173; GFX9-NEXT: s_mov_b32 s3, 0xf000 3174; GFX9-NEXT: s_mov_b32 s2, -1 3175; GFX9-NEXT: s_nop 0 3176; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3177; GFX9-NEXT: s_endpgm 3178; 3179; GFX1064-LABEL: max_i32_varying: 3180; GFX1064: ; %bb.0: ; %entry 3181; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3182; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3183; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3184; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3185; GFX1064-NEXT: s_not_b64 exec, exec 3186; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3187; GFX1064-NEXT: s_not_b64 exec, exec 3188; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3189; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3190; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3191; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3192; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3193; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3194; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3195; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3196; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3197; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3198; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] 
row_mask:0xc bank_mask:0xf 3199; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3200; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3201; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3202; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3203; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3204; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3205; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3206; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3207; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3208; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3209; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3210; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3211; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3212; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3213; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3214; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3215; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3216; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3217; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3218; GFX1064-NEXT: s_mov_b32 s2, -1 3219; GFX1064-NEXT: ; implicit-def: $vgpr0 3220; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3221; GFX1064-NEXT: s_cbranch_execz BB17_2 3222; GFX1064-NEXT: ; %bb.1: 3223; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3224; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3225; GFX1064-NEXT: s_mov_b32 s3, s7 3226; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3227; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3228; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 3229; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3230; GFX1064-NEXT: buffer_gl0_inv 3231; GFX1064-NEXT: BB17_2: 3232; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3233; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3234; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3235; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3236; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3237; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3238; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3239; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3240; GFX1064-NEXT: s_endpgm 3241; 3242; GFX1032-LABEL: max_i32_varying: 3243; GFX1032: ; %bb.0: ; 
%entry 3244; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3245; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3246; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3247; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3248; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3249; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3250; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3251; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3252; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3253; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3254; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3255; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3256; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3257; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3258; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3259; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3260; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3261; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3262; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3263; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3264; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3265; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3266; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3267; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3268; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3269; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3270; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3271; GFX1032-NEXT: s_mov_b32 s2, -1 3272; GFX1032-NEXT: ; implicit-def: $vgpr0 3273; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3274; GFX1032-NEXT: s_cbranch_execz BB17_2 3275; GFX1032-NEXT: ; %bb.1: 3276; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3277; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3278; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3279; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3280; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 3281; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3282; GFX1032-NEXT: buffer_gl0_inv 3283; GFX1032-NEXT: BB17_2: 3284; 
GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3285; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3286; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3287; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3288; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3289; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3290; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3291; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3292; GFX1032-NEXT: s_endpgm 3293entry: 3294 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3295 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3296 store i32 %old, i32 addrspace(1)* %out 3297 ret void 3298} 3299 3300define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3301; 3302; 3303; GFX7LESS-LABEL: max_i64_constant: 3304; GFX7LESS: ; %bb.0: ; %entry 3305; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3306; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3307; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3308; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3309; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3310; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3311; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3312; GFX7LESS-NEXT: ; %bb.1: 3313; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 3314; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3315; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3316; GFX7LESS-NEXT: s_mov_b32 m0, -1 3317; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3318; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3319; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3320; GFX7LESS-NEXT: BB18_2: 3321; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3322; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3323; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3324; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3325; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3326; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3327; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3328; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3329; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3330; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3331; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, 
s[4:5], v[0:1] 3332; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3333; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3334; GFX7LESS-NEXT: s_mov_b32 s2, -1 3335; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3336; GFX7LESS-NEXT: s_endpgm 3337; 3338; GFX8-LABEL: max_i64_constant: 3339; GFX8: ; %bb.0: ; %entry 3340; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3341; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3342; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3343; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3344; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3345; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3346; GFX8-NEXT: s_cbranch_execz BB18_2 3347; GFX8-NEXT: ; %bb.1: 3348; GFX8-NEXT: v_mov_b32_e32 v0, 5 3349; GFX8-NEXT: v_mov_b32_e32 v2, 0 3350; GFX8-NEXT: v_mov_b32_e32 v1, 0 3351; GFX8-NEXT: s_mov_b32 m0, -1 3352; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3353; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3354; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3355; GFX8-NEXT: BB18_2: 3356; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3357; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3358; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3359; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3360; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3361; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3362; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3363; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3364; GFX8-NEXT: v_mov_b32_e32 v2, s3 3365; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3366; GFX8-NEXT: v_mov_b32_e32 v2, s2 3367; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3368; GFX8-NEXT: s_mov_b32 s3, 0xf000 3369; GFX8-NEXT: s_mov_b32 s2, -1 3370; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3371; GFX8-NEXT: s_endpgm 3372; 3373; GFX9-LABEL: max_i64_constant: 3374; GFX9: ; %bb.0: ; %entry 3375; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3376; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3377; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3378; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3379; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1 3380; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3381; GFX9-NEXT: s_cbranch_execz BB18_2 3382; GFX9-NEXT: ; %bb.1: 3383; GFX9-NEXT: v_mov_b32_e32 v0, 5 3384; GFX9-NEXT: v_mov_b32_e32 v1, 0 3385; GFX9-NEXT: v_mov_b32_e32 v2, 0 3386; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3387; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3388; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3389; GFX9-NEXT: BB18_2: 3390; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3391; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3392; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3393; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3394; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3395; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3396; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3397; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3398; GFX9-NEXT: v_mov_b32_e32 v2, s3 3399; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3400; GFX9-NEXT: v_mov_b32_e32 v2, s2 3401; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3402; GFX9-NEXT: s_mov_b32 s3, 0xf000 3403; GFX9-NEXT: s_mov_b32 s2, -1 3404; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3405; GFX9-NEXT: s_endpgm 3406; 3407; GFX1064-LABEL: max_i64_constant: 3408; GFX1064: ; %bb.0: ; %entry 3409; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3410; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3411; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3412; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3413; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3414; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3415; GFX1064-NEXT: s_cbranch_execz BB18_2 3416; GFX1064-NEXT: ; %bb.1: 3417; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3418; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3419; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3420; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3421; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3422; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3423; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3424; GFX1064-NEXT: buffer_gl0_inv 3425; GFX1064-NEXT: BB18_2: 3426; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3427; GFX1064-NEXT: s_or_b64 
exec, exec, s[2:3] 3428; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3429; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3430; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3431; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3432; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3433; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3434; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3435; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3436; GFX1064-NEXT: s_mov_b32 s2, -1 3437; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3438; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3439; GFX1064-NEXT: s_endpgm 3440; 3441; GFX1032-LABEL: max_i64_constant: 3442; GFX1032: ; %bb.0: ; %entry 3443; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3444; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3445; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3446; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3447; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3448; GFX1032-NEXT: s_cbranch_execz BB18_2 3449; GFX1032-NEXT: ; %bb.1: 3450; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3451; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3452; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3453; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3454; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3455; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3456; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3457; GFX1032-NEXT: buffer_gl0_inv 3458; GFX1032-NEXT: BB18_2: 3459; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3460; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3461; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3462; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3463; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3464; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3465; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3466; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3467; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3468; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3469; GFX1032-NEXT: s_mov_b32 s2, -1 3470; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 
3471; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3472; GFX1032-NEXT: s_endpgm 3473entry: 3474 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3475 store i64 %old, i64 addrspace(1)* %out 3476 ret void 3477} 3478 3479define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3480; 3481; 3482; GFX7LESS-LABEL: min_i32_varying: 3483; GFX7LESS: ; %bb.0: ; %entry 3484; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3485; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3486; GFX7LESS-NEXT: s_mov_b32 m0, -1 3487; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3488; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3489; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3490; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3491; GFX7LESS-NEXT: s_mov_b32 s2, -1 3492; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3493; GFX7LESS-NEXT: s_endpgm 3494; 3495; GFX8-LABEL: min_i32_varying: 3496; GFX8: ; %bb.0: ; %entry 3497; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3498; GFX8-NEXT: v_mov_b32_e32 v2, v0 3499; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3500; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3501; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3502; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3503; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3504; GFX8-NEXT: s_not_b64 exec, exec 3505; GFX8-NEXT: v_mov_b32_e32 v2, v1 3506; GFX8-NEXT: s_not_b64 exec, exec 3507; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3508; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3509; GFX8-NEXT: s_nop 1 3510; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3511; GFX8-NEXT: s_nop 1 3512; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3513; GFX8-NEXT: s_nop 1 3514; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3515; GFX8-NEXT: s_nop 1 3516; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3517; GFX8-NEXT: s_nop 1 3518; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3519; 
GFX8-NEXT: v_readlane_b32 s4, v2, 63 3520; GFX8-NEXT: s_nop 0 3521; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3522; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3523; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3524; GFX8-NEXT: ; implicit-def: $vgpr0 3525; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3526; GFX8-NEXT: s_cbranch_execz BB19_2 3527; GFX8-NEXT: ; %bb.1: 3528; GFX8-NEXT: v_mov_b32_e32 v0, 0 3529; GFX8-NEXT: v_mov_b32_e32 v3, s4 3530; GFX8-NEXT: s_mov_b32 m0, -1 3531; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3532; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3533; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3534; GFX8-NEXT: BB19_2: 3535; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3536; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3537; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3538; GFX8-NEXT: v_mov_b32_e32 v0, v1 3539; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3540; GFX8-NEXT: s_mov_b32 s3, 0xf000 3541; GFX8-NEXT: s_mov_b32 s2, -1 3542; GFX8-NEXT: s_nop 0 3543; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3544; GFX8-NEXT: s_endpgm 3545; 3546; GFX9-LABEL: min_i32_varying: 3547; GFX9: ; %bb.0: ; %entry 3548; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3549; GFX9-NEXT: v_mov_b32_e32 v2, v0 3550; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3551; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3552; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3553; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3554; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3555; GFX9-NEXT: s_not_b64 exec, exec 3556; GFX9-NEXT: v_mov_b32_e32 v2, v1 3557; GFX9-NEXT: s_not_b64 exec, exec 3558; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3559; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3560; GFX9-NEXT: s_nop 1 3561; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3562; GFX9-NEXT: s_nop 1 3563; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3564; GFX9-NEXT: s_nop 1 3565; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3566; GFX9-NEXT: s_nop 1 3567; GFX9-NEXT: 
v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3568; GFX9-NEXT: s_nop 1 3569; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3570; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3571; GFX9-NEXT: s_nop 0 3572; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3573; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3574; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3575; GFX9-NEXT: ; implicit-def: $vgpr0 3576; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3577; GFX9-NEXT: s_cbranch_execz BB19_2 3578; GFX9-NEXT: ; %bb.1: 3579; GFX9-NEXT: v_mov_b32_e32 v0, 0 3580; GFX9-NEXT: v_mov_b32_e32 v3, s4 3581; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3582; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3583; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3584; GFX9-NEXT: BB19_2: 3585; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3586; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3587; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3588; GFX9-NEXT: v_mov_b32_e32 v0, v1 3589; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3590; GFX9-NEXT: s_mov_b32 s3, 0xf000 3591; GFX9-NEXT: s_mov_b32 s2, -1 3592; GFX9-NEXT: s_nop 0 3593; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3594; GFX9-NEXT: s_endpgm 3595; 3596; GFX1064-LABEL: min_i32_varying: 3597; GFX1064: ; %bb.0: ; %entry 3598; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3599; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3600; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3601; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3602; GFX1064-NEXT: s_not_b64 exec, exec 3603; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3604; GFX1064-NEXT: s_not_b64 exec, exec 3605; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3606; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3607; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3608; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3609; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3610; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3611; GFX1064-NEXT: v_permlanex16_b32 v3, v3, 
-1, -1 3612; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3613; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3614; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3615; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3616; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3617; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3618; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3619; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3620; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3621; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3622; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3623; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3624; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3625; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3626; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3627; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3628; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3629; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3630; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3631; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3632; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3633; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3634; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3635; GFX1064-NEXT: s_mov_b32 s2, -1 3636; GFX1064-NEXT: ; implicit-def: $vgpr0 3637; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3638; GFX1064-NEXT: s_cbranch_execz BB19_2 3639; GFX1064-NEXT: ; %bb.1: 3640; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3641; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3642; GFX1064-NEXT: s_mov_b32 s3, s7 3643; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3644; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3645; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 3646; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3647; GFX1064-NEXT: buffer_gl0_inv 3648; GFX1064-NEXT: BB19_2: 3649; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3650; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3651; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3652; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3653; GFX1064-NEXT: v_min_i32_e32 v0, 
s3, v0 3654; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3655; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3656; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3657; GFX1064-NEXT: s_endpgm 3658; 3659; GFX1032-LABEL: min_i32_varying: 3660; GFX1032: ; %bb.0: ; %entry 3661; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3662; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3663; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3664; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3665; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3666; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3667; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3668; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3669; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3670; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3671; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3672; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3673; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3674; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3675; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3676; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3677; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3678; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3679; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3680; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3681; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3682; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3683; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3684; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3685; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3686; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3687; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3688; GFX1032-NEXT: s_mov_b32 s2, -1 3689; GFX1032-NEXT: ; implicit-def: $vgpr0 3690; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3691; GFX1032-NEXT: s_cbranch_execz BB19_2 3692; GFX1032-NEXT: ; %bb.1: 3693; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3694; GFX1032-NEXT: v_mov_b32_e32 v4, s4 
3695; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3696; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3697; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 3698; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3699; GFX1032-NEXT: buffer_gl0_inv 3700; GFX1032-NEXT: BB19_2: 3701; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3702; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3703; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3704; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3705; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3706; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3707; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3708; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3709; GFX1032-NEXT: s_endpgm 3710entry: 3711 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3712 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3713 store i32 %old, i32 addrspace(1)* %out 3714 ret void 3715} 3716 3717define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3718; 3719; 3720; GFX7LESS-LABEL: min_i64_constant: 3721; GFX7LESS: ; %bb.0: ; %entry 3722; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3723; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3724; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3725; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3726; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3727; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3728; GFX7LESS-NEXT: s_cbranch_execz BB20_2 3729; GFX7LESS-NEXT: ; %bb.1: 3730; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 3731; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3732; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3733; GFX7LESS-NEXT: s_mov_b32 m0, -1 3734; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3735; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3736; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3737; GFX7LESS-NEXT: BB20_2: 3738; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3739; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3740; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3741; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3742; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3743; GFX7LESS-NEXT: 
s_mov_b32 s2, -1 3744; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3745; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3746; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3747; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3748; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3749; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3750; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3751; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3752; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3753; GFX7LESS-NEXT: s_endpgm 3754; 3755; GFX8-LABEL: min_i64_constant: 3756; GFX8: ; %bb.0: ; %entry 3757; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3758; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3759; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3760; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3761; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3762; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3763; GFX8-NEXT: s_cbranch_execz BB20_2 3764; GFX8-NEXT: ; %bb.1: 3765; GFX8-NEXT: v_mov_b32_e32 v0, 5 3766; GFX8-NEXT: v_mov_b32_e32 v2, 0 3767; GFX8-NEXT: v_mov_b32_e32 v1, 0 3768; GFX8-NEXT: s_mov_b32 m0, -1 3769; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3770; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3771; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3772; GFX8-NEXT: BB20_2: 3773; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3774; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3775; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3776; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3777; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3778; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3779; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3780; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3781; GFX8-NEXT: v_mov_b32_e32 v2, s5 3782; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3783; GFX8-NEXT: v_mov_b32_e32 v2, s4 3784; GFX8-NEXT: s_mov_b32 s2, -1 3785; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3786; GFX8-NEXT: s_mov_b32 s3, 0xf000 3787; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3788; GFX8-NEXT: s_endpgm 3789; 3790; GFX9-LABEL: min_i64_constant: 3791; GFX9: 
; %bb.0: ; %entry 3792; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3793; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3794; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3795; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3796; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3797; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3798; GFX9-NEXT: s_cbranch_execz BB20_2 3799; GFX9-NEXT: ; %bb.1: 3800; GFX9-NEXT: v_mov_b32_e32 v0, 5 3801; GFX9-NEXT: v_mov_b32_e32 v1, 0 3802; GFX9-NEXT: v_mov_b32_e32 v2, 0 3803; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3804; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3805; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3806; GFX9-NEXT: BB20_2: 3807; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3808; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3809; GFX9-NEXT: v_readfirstlane_b32 s4, v0 3810; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 3811; GFX9-NEXT: v_readfirstlane_b32 s5, v1 3812; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3813; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3814; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3815; GFX9-NEXT: v_mov_b32_e32 v2, s5 3816; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3817; GFX9-NEXT: v_mov_b32_e32 v2, s4 3818; GFX9-NEXT: s_mov_b32 s2, -1 3819; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3820; GFX9-NEXT: s_mov_b32 s3, 0xf000 3821; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3822; GFX9-NEXT: s_endpgm 3823; 3824; GFX1064-LABEL: min_i64_constant: 3825; GFX1064: ; %bb.0: ; %entry 3826; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3827; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3828; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3829; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3830; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3831; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3832; GFX1064-NEXT: s_cbranch_execz BB20_2 3833; GFX1064-NEXT: ; %bb.1: 3834; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3835; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3836; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3837; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3838; GFX1064-NEXT: 
s_waitcnt_vscnt null, 0x0 3839; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3840; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3841; GFX1064-NEXT: buffer_gl0_inv 3842; GFX1064-NEXT: BB20_2: 3843; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3844; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3845; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3846; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3847; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 3848; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3849; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 3850; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3851; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3852; GFX1064-NEXT: s_mov_b32 s2, -1 3853; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3854; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3855; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3856; GFX1064-NEXT: s_endpgm 3857; 3858; GFX1032-LABEL: min_i64_constant: 3859; GFX1032: ; %bb.0: ; %entry 3860; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3861; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3862; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3863; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3864; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3865; GFX1032-NEXT: s_cbranch_execz BB20_2 3866; GFX1032-NEXT: ; %bb.1: 3867; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3868; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3869; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3870; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3871; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3872; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3873; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3874; GFX1032-NEXT: buffer_gl0_inv 3875; GFX1032-NEXT: BB20_2: 3876; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3877; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3878; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3879; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3880; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 3881; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 3882; GFX1032-NEXT: v_cmp_lt_i64_e32 
vcc_lo, s[2:3], v[0:1] 3883; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3884; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3885; GFX1032-NEXT: s_mov_b32 s2, -1 3886; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3887; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3888; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3889; GFX1032-NEXT: s_endpgm 3890entry: 3891 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 3892 store i64 %old, i64 addrspace(1)* %out 3893 ret void 3894} 3895 3896define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 3897; 3898; 3899; GFX7LESS-LABEL: umax_i32_varying: 3900; GFX7LESS: ; %bb.0: ; %entry 3901; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3902; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3903; GFX7LESS-NEXT: s_mov_b32 m0, -1 3904; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3905; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 3906; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3907; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3908; GFX7LESS-NEXT: s_mov_b32 s2, -1 3909; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3910; GFX7LESS-NEXT: s_endpgm 3911; 3912; GFX8-LABEL: umax_i32_varying: 3913; GFX8: ; %bb.0: ; %entry 3914; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3915; GFX8-NEXT: v_mov_b32_e32 v2, v0 3916; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3917; GFX8-NEXT: v_mov_b32_e32 v1, 0 3918; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3919; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3920; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3921; GFX8-NEXT: s_not_b64 exec, exec 3922; GFX8-NEXT: v_mov_b32_e32 v2, 0 3923; GFX8-NEXT: s_not_b64 exec, exec 3924; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3925; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3926; GFX8-NEXT: s_nop 1 3927; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3928; GFX8-NEXT: s_nop 1 3929; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3930; GFX8-NEXT: s_nop 1 
3931; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3932; GFX8-NEXT: s_nop 1 3933; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3934; GFX8-NEXT: s_nop 1 3935; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3936; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3937; GFX8-NEXT: s_nop 0 3938; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3939; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3940; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3941; GFX8-NEXT: ; implicit-def: $vgpr0 3942; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3943; GFX8-NEXT: s_cbranch_execz BB21_2 3944; GFX8-NEXT: ; %bb.1: 3945; GFX8-NEXT: v_mov_b32_e32 v0, 0 3946; GFX8-NEXT: v_mov_b32_e32 v3, s4 3947; GFX8-NEXT: s_mov_b32 m0, -1 3948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3949; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 3950; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3951; GFX8-NEXT: BB21_2: 3952; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3953; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3954; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3955; GFX8-NEXT: v_mov_b32_e32 v0, v1 3956; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 3957; GFX8-NEXT: s_mov_b32 s3, 0xf000 3958; GFX8-NEXT: s_mov_b32 s2, -1 3959; GFX8-NEXT: s_nop 0 3960; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3961; GFX8-NEXT: s_endpgm 3962; 3963; GFX9-LABEL: umax_i32_varying: 3964; GFX9: ; %bb.0: ; %entry 3965; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3966; GFX9-NEXT: v_mov_b32_e32 v2, v0 3967; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3968; GFX9-NEXT: v_mov_b32_e32 v1, 0 3969; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3970; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3971; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3972; GFX9-NEXT: s_not_b64 exec, exec 3973; GFX9-NEXT: v_mov_b32_e32 v2, 0 3974; GFX9-NEXT: s_not_b64 exec, exec 3975; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3976; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3977; GFX9-NEXT: s_nop 1 3978; 
GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3979; GFX9-NEXT: s_nop 1 3980; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3981; GFX9-NEXT: s_nop 1 3982; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3983; GFX9-NEXT: s_nop 1 3984; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3985; GFX9-NEXT: s_nop 1 3986; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3987; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3988; GFX9-NEXT: s_nop 0 3989; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3990; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3991; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3992; GFX9-NEXT: ; implicit-def: $vgpr0 3993; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3994; GFX9-NEXT: s_cbranch_execz BB21_2 3995; GFX9-NEXT: ; %bb.1: 3996; GFX9-NEXT: v_mov_b32_e32 v0, 0 3997; GFX9-NEXT: v_mov_b32_e32 v3, s4 3998; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3999; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4000; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4001; GFX9-NEXT: BB21_2: 4002; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4003; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4004; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4005; GFX9-NEXT: v_mov_b32_e32 v0, v1 4006; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4007; GFX9-NEXT: s_mov_b32 s3, 0xf000 4008; GFX9-NEXT: s_mov_b32 s2, -1 4009; GFX9-NEXT: s_nop 0 4010; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4011; GFX9-NEXT: s_endpgm 4012; 4013; GFX1064-LABEL: umax_i32_varying: 4014; GFX1064: ; %bb.0: ; %entry 4015; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4016; GFX1064-NEXT: s_not_b64 exec, exec 4017; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4018; GFX1064-NEXT: s_not_b64 exec, exec 4019; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4020; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4021; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4022; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4023; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4024; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4025; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4026; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4027; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4028; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4029; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4030; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4031; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4032; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4033; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4034; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4035; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4036; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4037; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4038; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4039; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4040; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4041; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4042; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4043; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4044; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4045; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4046; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4047; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4048; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4049; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4050; GFX1064-NEXT: s_mov_b32 s2, -1 4051; GFX1064-NEXT: ; implicit-def: $vgpr0 4052; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4053; GFX1064-NEXT: s_cbranch_execz BB21_2 4054; GFX1064-NEXT: ; %bb.1: 4055; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4056; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4057; GFX1064-NEXT: s_mov_b32 s3, s7 4058; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4059; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4060; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, 
v4 4061; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4062; GFX1064-NEXT: buffer_gl0_inv 4063; GFX1064-NEXT: BB21_2: 4064; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4065; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4066; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4067; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4068; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4069; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4070; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4071; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4072; GFX1064-NEXT: s_endpgm 4073; 4074; GFX1032-LABEL: umax_i32_varying: 4075; GFX1032: ; %bb.0: ; %entry 4076; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4077; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4078; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4079; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4080; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4081; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4082; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4083; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4084; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4085; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4086; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4087; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4088; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4089; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4090; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4091; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4092; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4093; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4094; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4095; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4096; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4097; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4098; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4099; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4100; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, 
v0 4101; GFX1032-NEXT: s_mov_b32 s2, -1 4102; GFX1032-NEXT: ; implicit-def: $vgpr0 4103; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4104; GFX1032-NEXT: s_cbranch_execz BB21_2 4105; GFX1032-NEXT: ; %bb.1: 4106; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4107; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4108; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4109; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4110; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 4111; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4112; GFX1032-NEXT: buffer_gl0_inv 4113; GFX1032-NEXT: BB21_2: 4114; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4115; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4116; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4117; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4118; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4119; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4120; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4121; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4122; GFX1032-NEXT: s_endpgm 4123entry: 4124 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4125 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4126 store i32 %old, i32 addrspace(1)* %out 4127 ret void 4128} 4129 4130define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4131; 4132; 4133; GFX7LESS-LABEL: umax_i64_constant: 4134; GFX7LESS: ; %bb.0: ; %entry 4135; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4136; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4137; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4138; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4139; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4140; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4141; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4142; GFX7LESS-NEXT: ; %bb.1: 4143; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 4144; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4145; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4146; GFX7LESS-NEXT: s_mov_b32 m0, -1 4147; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4148; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4149; GFX7LESS-NEXT: s_waitcnt 
lgkmcnt(0) 4150; GFX7LESS-NEXT: BB22_2: 4151; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4152; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4153; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4154; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4155; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4156; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4157; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4158; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4159; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4160; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4161; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4162; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4163; GFX7LESS-NEXT: s_mov_b32 s2, -1 4164; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4165; GFX7LESS-NEXT: s_endpgm 4166; 4167; GFX8-LABEL: umax_i64_constant: 4168; GFX8: ; %bb.0: ; %entry 4169; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4170; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4171; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4172; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4173; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4174; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4175; GFX8-NEXT: s_cbranch_execz BB22_2 4176; GFX8-NEXT: ; %bb.1: 4177; GFX8-NEXT: v_mov_b32_e32 v0, 5 4178; GFX8-NEXT: v_mov_b32_e32 v2, 0 4179; GFX8-NEXT: v_mov_b32_e32 v1, 0 4180; GFX8-NEXT: s_mov_b32 m0, -1 4181; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4182; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4183; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4184; GFX8-NEXT: BB22_2: 4185; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4186; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4187; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4188; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4189; GFX8-NEXT: v_mov_b32_e32 v1, 0 4190; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4191; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4192; GFX8-NEXT: v_mov_b32_e32 v1, s3 4193; GFX8-NEXT: v_mov_b32_e32 v2, s2 4194; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4195; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4196; GFX8-NEXT: s_mov_b32 s3, 
0xf000 4197; GFX8-NEXT: s_mov_b32 s2, -1 4198; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4199; GFX8-NEXT: s_endpgm 4200; 4201; GFX9-LABEL: umax_i64_constant: 4202; GFX9: ; %bb.0: ; %entry 4203; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4204; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4205; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4206; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4207; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4208; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4209; GFX9-NEXT: s_cbranch_execz BB22_2 4210; GFX9-NEXT: ; %bb.1: 4211; GFX9-NEXT: v_mov_b32_e32 v0, 5 4212; GFX9-NEXT: v_mov_b32_e32 v1, 0 4213; GFX9-NEXT: v_mov_b32_e32 v2, 0 4214; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4215; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4216; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4217; GFX9-NEXT: BB22_2: 4218; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4219; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4220; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4221; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4222; GFX9-NEXT: v_mov_b32_e32 v1, 0 4223; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4224; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4225; GFX9-NEXT: v_mov_b32_e32 v1, s3 4226; GFX9-NEXT: v_mov_b32_e32 v2, s2 4227; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4228; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4229; GFX9-NEXT: s_mov_b32 s3, 0xf000 4230; GFX9-NEXT: s_mov_b32 s2, -1 4231; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4232; GFX9-NEXT: s_endpgm 4233; 4234; GFX1064-LABEL: umax_i64_constant: 4235; GFX1064: ; %bb.0: ; %entry 4236; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4237; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4238; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4239; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4240; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4241; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4242; GFX1064-NEXT: s_cbranch_execz BB22_2 4243; GFX1064-NEXT: ; %bb.1: 4244; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4245; GFX1064-NEXT: 
v_mov_b32_e32 v1, 0 4246; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4247; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4248; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4249; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4250; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4251; GFX1064-NEXT: buffer_gl0_inv 4252; GFX1064-NEXT: BB22_2: 4253; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4254; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4255; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4256; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4257; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4258; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4259; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4260; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4261; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4262; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4263; GFX1064-NEXT: s_mov_b32 s2, -1 4264; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4265; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4266; GFX1064-NEXT: s_endpgm 4267; 4268; GFX1032-LABEL: umax_i64_constant: 4269; GFX1032: ; %bb.0: ; %entry 4270; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4271; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4272; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4273; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4274; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4275; GFX1032-NEXT: s_cbranch_execz BB22_2 4276; GFX1032-NEXT: ; %bb.1: 4277; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4278; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4279; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4280; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4281; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4282; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4283; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4284; GFX1032-NEXT: buffer_gl0_inv 4285; GFX1032-NEXT: BB22_2: 4286; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4287; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4288; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4289; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4290; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4291; 
GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4292; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 4293; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4294; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 4295; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4296; GFX1032-NEXT: s_mov_b32 s2, -1 4297; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4298; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4299; GFX1032-NEXT: s_endpgm 4300entry: 4301 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4302 store i64 %old, i64 addrspace(1)* %out 4303 ret void 4304} 4305 4306define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4307; 4308; 4309; GFX7LESS-LABEL: umin_i32_varying: 4310; GFX7LESS: ; %bb.0: ; %entry 4311; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4312; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4313; GFX7LESS-NEXT: s_mov_b32 m0, -1 4314; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4315; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4316; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4317; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4318; GFX7LESS-NEXT: s_mov_b32 s2, -1 4319; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4320; GFX7LESS-NEXT: s_endpgm 4321; 4322; GFX8-LABEL: umin_i32_varying: 4323; GFX8: ; %bb.0: ; %entry 4324; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4325; GFX8-NEXT: v_mov_b32_e32 v2, v0 4326; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4327; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4328; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4329; GFX8-NEXT: v_mov_b32_e32 v1, -1 4330; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4331; GFX8-NEXT: s_not_b64 exec, exec 4332; GFX8-NEXT: v_mov_b32_e32 v2, -1 4333; GFX8-NEXT: s_not_b64 exec, exec 4334; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4335; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4336; GFX8-NEXT: s_nop 1 4337; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4338; GFX8-NEXT: s_nop 1 4339; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 
row_mask:0xf bank_mask:0xf 4340; GFX8-NEXT: s_nop 1 4341; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4342; GFX8-NEXT: s_nop 1 4343; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4344; GFX8-NEXT: s_nop 1 4345; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4346; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4347; GFX8-NEXT: s_nop 0 4348; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4349; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4350; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4351; GFX8-NEXT: ; implicit-def: $vgpr0 4352; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4353; GFX8-NEXT: s_cbranch_execz BB23_2 4354; GFX8-NEXT: ; %bb.1: 4355; GFX8-NEXT: v_mov_b32_e32 v0, 0 4356; GFX8-NEXT: v_mov_b32_e32 v3, s4 4357; GFX8-NEXT: s_mov_b32 m0, -1 4358; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4359; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4360; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4361; GFX8-NEXT: BB23_2: 4362; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4363; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4364; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4365; GFX8-NEXT: v_mov_b32_e32 v0, v1 4366; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4367; GFX8-NEXT: s_mov_b32 s3, 0xf000 4368; GFX8-NEXT: s_mov_b32 s2, -1 4369; GFX8-NEXT: s_nop 0 4370; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4371; GFX8-NEXT: s_endpgm 4372; 4373; GFX9-LABEL: umin_i32_varying: 4374; GFX9: ; %bb.0: ; %entry 4375; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4376; GFX9-NEXT: v_mov_b32_e32 v2, v0 4377; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4378; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4379; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4380; GFX9-NEXT: v_mov_b32_e32 v1, -1 4381; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4382; GFX9-NEXT: s_not_b64 exec, exec 4383; GFX9-NEXT: v_mov_b32_e32 v2, -1 4384; GFX9-NEXT: s_not_b64 exec, exec 4385; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4386; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
4387; GFX9-NEXT: s_nop 1 4388; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4389; GFX9-NEXT: s_nop 1 4390; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4391; GFX9-NEXT: s_nop 1 4392; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4393; GFX9-NEXT: s_nop 1 4394; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4395; GFX9-NEXT: s_nop 1 4396; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4397; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4398; GFX9-NEXT: s_nop 0 4399; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4400; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4401; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4402; GFX9-NEXT: ; implicit-def: $vgpr0 4403; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4404; GFX9-NEXT: s_cbranch_execz BB23_2 4405; GFX9-NEXT: ; %bb.1: 4406; GFX9-NEXT: v_mov_b32_e32 v0, 0 4407; GFX9-NEXT: v_mov_b32_e32 v3, s4 4408; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4409; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4411; GFX9-NEXT: BB23_2: 4412; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4413; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4414; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4415; GFX9-NEXT: v_mov_b32_e32 v0, v1 4416; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4417; GFX9-NEXT: s_mov_b32 s3, 0xf000 4418; GFX9-NEXT: s_mov_b32 s2, -1 4419; GFX9-NEXT: s_nop 0 4420; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4421; GFX9-NEXT: s_endpgm 4422; 4423; GFX1064-LABEL: umin_i32_varying: 4424; GFX1064: ; %bb.0: ; %entry 4425; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4426; GFX1064-NEXT: s_not_b64 exec, exec 4427; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4428; GFX1064-NEXT: s_not_b64 exec, exec 4429; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4430; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4431; GFX1064-NEXT: v_mov_b32_e32 v3, -1 4432; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf 
bank_mask:0xf 4433; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4434; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4435; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4436; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4437; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4438; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4439; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4440; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4441; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4442; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4443; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4444; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4445; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4446; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4447; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4448; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4449; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4450; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4451; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4452; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4453; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4454; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4455; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4456; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4457; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4458; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4459; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4460; GFX1064-NEXT: s_mov_b32 s2, -1 4461; GFX1064-NEXT: ; implicit-def: $vgpr0 4462; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4463; GFX1064-NEXT: s_cbranch_execz BB23_2 4464; GFX1064-NEXT: ; %bb.1: 4465; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4466; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4467; GFX1064-NEXT: s_mov_b32 s3, s7 4468; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4469; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4470; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 4471; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4472; 
GFX1064-NEXT: buffer_gl0_inv 4473; GFX1064-NEXT: BB23_2: 4474; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4475; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4476; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4477; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4478; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4479; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4480; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4481; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4482; GFX1064-NEXT: s_endpgm 4483; 4484; GFX1032-LABEL: umin_i32_varying: 4485; GFX1032: ; %bb.0: ; %entry 4486; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4487; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4488; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4489; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4490; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4491; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4492; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4493; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4494; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4495; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4496; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4497; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4498; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4499; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4500; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4501; GFX1032-NEXT: v_mov_b32_e32 v3, -1 4502; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4503; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4504; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4505; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4506; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4507; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4508; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4509; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4510; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4511; GFX1032-NEXT: s_mov_b32 s2, -1 4512; GFX1032-NEXT: ; implicit-def: $vgpr0 4513; 
GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4514; GFX1032-NEXT: s_cbranch_execz BB23_2 4515; GFX1032-NEXT: ; %bb.1: 4516; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4517; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4518; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4519; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4520; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 4521; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4522; GFX1032-NEXT: buffer_gl0_inv 4523; GFX1032-NEXT: BB23_2: 4524; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4525; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4526; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4527; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4528; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4529; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4530; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4531; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4532; GFX1032-NEXT: s_endpgm 4533entry: 4534 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4535 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4536 store i32 %old, i32 addrspace(1)* %out 4537 ret void 4538} 4539

; Test: 64-bit unsigned-min atomic on a local (addrspace(3)) variable where the
; operand is the uniform immediate 5 rather than a per-lane value.
;
; The kernel issues one acq_rel atomicrmw umin on @local_var64 and writes the
; value returned by the atomic to %out.
;
; What the expectations below pin down for every run line:
;   * the lane whose v_mbcnt result equals 0 is selected with s_and_saveexec,
;     and the single ds_min_rtn_u64 sits inside that guarded region;
;   * after exec is restored, the returned pair is read with
;     v_readfirstlane_b32 and combined through v_cmp_lt_u64 and v_cndmask
;     before the buffer_store_dwordx2;
;   * the wave32 run uses only v_mbcnt_lo and the 32-bit exec forms
;     (s_and_saveexec_b32, s_or_b32 on exec_lo).
;
; The assertion lines are produced by utils/update_llc_test_checks.py (see the
; note on the first line of this file); regenerate them with that script
; instead of editing them by hand.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  ; The block label must stay "entry": the expectations above match "%entry".
  %prev = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Store the value handed back by the atomic so the result stays live.
  store i64 %prev, i64 addrspace(1)* %out
  ret void
}