; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)

; Show what the atomic optimization pass will do for raw buffers.
;
; Both functions below place the raw buffer atomic add between two
; llvm.amdgcn.wqm.vote calls and store the atomic's result only when both
; votes are true.

; Uniform addend, every lane adds the constant 5. The expected code has the
; lane whose mbcnt is 0 issue one atomic of popcount(exec) * 5, then each lane
; rebuilds its own value from the broadcast result with a multiply-add.
define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
; GFX7-LABEL: add_i32_constant:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], exec
; GFX7-NEXT:    ; implicit-def: $vgpr0
; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX7-NEXT:    s_cbranch_execz BB0_4
; GFX7-NEXT:  ; %bb.1:
; GFX7-NEXT:    s_mov_b64 s[12:13], exec
; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s13, v0
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT:    ; implicit-def: $vgpr1
; GFX7-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX7-NEXT:    s_cbranch_execz BB0_3
; GFX7-NEXT:  ; %bb.2:
; GFX7-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX7-NEXT:    s_mul_i32 s12, s12, 5
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX7-NEXT:  BB0_3:
; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX7-NEXT:  BB0_4: ; %Flow
; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_vccnz BB0_6
; GFX7-NEXT:  ; %bb.5: ; %if
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: add_i32_constant:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_mov_b64 s[10:11], exec
; GFX89-NEXT:    ; implicit-def: $vgpr0
; GFX89-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX89-NEXT:    s_cbranch_execz BB0_4
; GFX89-NEXT:  ; %bb.1:
; GFX89-NEXT:    s_mov_b64 s[12:13], exec
; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX89-NEXT:    ; implicit-def: $vgpr1
; GFX89-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX89-NEXT:    s_cbranch_execz BB0_3
; GFX89-NEXT:  ; %bb.2:
; GFX89-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX89-NEXT:    s_mul_i32 s12, s12, 5
; GFX89-NEXT:    v_mov_b32_e32 v1, s12
; GFX89-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX89-NEXT:  BB0_3:
; GFX89-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX89-NEXT:  BB0_4: ; %Flow
; GFX89-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX89-NEXT:    s_wqm_b64 s[4:5], -1
; GFX89-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX89-NEXT:    s_cbranch_vccnz BB0_6
; GFX89-NEXT:  ; %bb.5: ; %if
; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX89-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT:    s_cbranch_execz BB0_4
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_mov_b64 s[12:13], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX1064-NEXT:    s_cbranch_execz BB0_3
; GFX1064-NEXT:  ; %bb.2:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX1064-NEXT:    s_mul_i32 s12, s12, 5
; GFX1064-NEXT:    v_mov_b32_e32 v1, s12
; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1064-NEXT:  BB0_3:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1064-NEXT:  BB0_4: ; %Flow
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT:    s_cbranch_vccnz BB0_6
; GFX1064-NEXT:  ; %bb.5: ; %if
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1032-NEXT:    s_cbranch_execz BB0_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_mov_b32 s10, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB0_3
; GFX1032-NEXT:  ; %bb.2:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
; GFX1032-NEXT:    s_mul_i32 s10, s10, 5
; GFX1032-NEXT:    v_mov_b32_e32 v1, s10
; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT:  BB0_3:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1032-NEXT:  BB0_4: ; %Flow
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_wqm_b32 s4, -1
; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT:    s_cbranch_vccnz BB0_6
; GFX1032-NEXT:  ; %bb.5: ; %if
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX1032-NEXT:    s_endpgm
entry:
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}

; Per-lane addend (%val). The GFX7 expectations keep a plain per-lane
; buffer_atomic_add, while the GFX8 and newer expectations combine the lane
; values with DPP adds before a single atomic is issued.
define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
; GFX7-LABEL: add_i32_varying:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
; GFX7-NEXT:    s_cbranch_vccnz BB1_2
; GFX7-NEXT:  ; %bb.1: ; %if
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:  BB1_2: ; %else
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b64 s[8:9], exec
; GFX8-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT:    s_cbranch_execz BB1_4
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s12, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT:    s_cbranch_execz BB1_3
; GFX8-NEXT:  ; %bb.2:
; GFX8-NEXT:    v_mov_b32_e32 v0, s12
; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX8-NEXT:  BB1_3:
; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:  BB1_4: ; %Flow
; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_vccnz BB1_6
; GFX8-NEXT:  ; %bb.5: ; %if
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[8:9], exec
; GFX9-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT:    s_cbranch_execz BB1_4
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s12, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT:    s_cbranch_execz BB1_3
; GFX9-NEXT:  ; %bb.2:
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX9-NEXT:  BB1_3:
; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
; GFX9-NEXT:  BB1_4: ; %Flow
; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_vccnz BB1_6
; GFX9-NEXT:  ; %bb.5: ; %if
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT:    s_cbranch_execz BB1_4
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s14, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s13, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s14, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX1064-NEXT:    s_cbranch_execz BB1_3
; GFX1064-NEXT:  ; %bb.2:
; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1064-NEXT:  BB1_3:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s4, v0
; GFX1064-NEXT:  BB1_4: ; %Flow
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT:    s_cbranch_vccnz BB1_6
; GFX1064-NEXT:  ; %bb.5: ; %if
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_mov_b32 s9, s8
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1032-NEXT:    s_cbranch_execz BB1_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s10, v1, 15
; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s10, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB1_3
; GFX1032-NEXT:  ; %bb.2:
; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
; GFX1032-NEXT:    s_mov_b32 s10, s11
; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1032-NEXT:  BB1_3:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s4, v0
; GFX1032-NEXT:  BB1_4: ; %Flow
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_wqm_b32 s4, -1
; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT:    s_cbranch_vccnz BB1_6
; GFX1032-NEXT:  ; %bb.5: ; %if
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX1032-NEXT:    s_endpgm
entry:
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}