1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s 3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s 4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10_W32 %s 5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX10_W64 %s 6 7define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) { 8; GFX7-LABEL: v_div_fmas_f32: 9; GFX7: ; %bb.0: 10; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 12; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 13; GFX7-NEXT: s_nop 3 14; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 15; GFX7-NEXT: s_setpc_b64 s[30:31] 16; 17; GFX8-LABEL: v_div_fmas_f32: 18; GFX8: ; %bb.0: 19; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 20; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 21; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 22; GFX8-NEXT: s_nop 3 23; GFX8-NEXT: v_div_fmas_f32 v0, v0, v1, v2 24; GFX8-NEXT: s_setpc_b64 s[30:31] 25; 26; GFX10_W32-LABEL: v_div_fmas_f32: 27; GFX10_W32: ; %bb.0: 28; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29; GFX10_W32-NEXT: s_waitcnt_vscnt null, 0x0 30; GFX10_W32-NEXT: v_and_b32_e32 v3, 1, v3 31; GFX10_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 32; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2 33; GFX10_W32-NEXT: s_setpc_b64 s[30:31] 34; 35; GFX10_W64-LABEL: v_div_fmas_f32: 36; GFX10_W64: ; %bb.0: 37; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 38; GFX10_W64-NEXT: s_waitcnt_vscnt null, 0x0 39; GFX10_W64-NEXT: v_and_b32_e32 v3, 1, v3 40; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 41; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2 42; GFX10_W64-NEXT: s_setpc_b64 s[30:31] 43 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) 44 ret 
float %result 45} 46 47define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) { 48; GFX7-LABEL: v_div_fmas_f64: 49; GFX7: ; %bb.0: 50; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 51; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 52; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 53; GFX7-NEXT: s_nop 3 54; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 55; GFX7-NEXT: s_setpc_b64 s[30:31] 56; 57; GFX8-LABEL: v_div_fmas_f64: 58; GFX8: ; %bb.0: 59; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 60; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 61; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 62; GFX8-NEXT: s_nop 3 63; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 64; GFX8-NEXT: s_setpc_b64 s[30:31] 65; 66; GFX10_W32-LABEL: v_div_fmas_f64: 67; GFX10_W32: ; %bb.0: 68; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 69; GFX10_W32-NEXT: s_waitcnt_vscnt null, 0x0 70; GFX10_W32-NEXT: v_and_b32_e32 v6, 1, v6 71; GFX10_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 72; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 73; GFX10_W32-NEXT: s_setpc_b64 s[30:31] 74; 75; GFX10_W64-LABEL: v_div_fmas_f64: 76; GFX10_W64: ; %bb.0: 77; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 78; GFX10_W64-NEXT: s_waitcnt_vscnt null, 0x0 79; GFX10_W64-NEXT: v_and_b32_e32 v6, 1, v6 80; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 81; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 82; GFX10_W64-NEXT: s_setpc_b64 s[30:31] 83 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) 84 ret double %result 85} 86 87define amdgpu_ps float @s_div_fmas_f32(float inreg %a, float inreg %b, float inreg %c, i32 inreg %d) { 88; GFX7-LABEL: s_div_fmas_f32: 89; GFX7: ; %bb.0: 90; GFX7-NEXT: s_cmp_eq_u32 s3, 0 91; GFX7-NEXT: s_cselect_b32 s3, 1, 0 92; GFX7-NEXT: v_mov_b32_e32 v0, s0 93; GFX7-NEXT: s_and_b32 s0, 1, s3 94; GFX7-NEXT: v_mov_b32_e32 v1, s1 95; GFX7-NEXT: v_mov_b32_e32 v2, s2 96; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 97; 
GFX7-NEXT: s_nop 3 98; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 99; GFX7-NEXT: ; return to shader part epilog 100; 101; GFX8-LABEL: s_div_fmas_f32: 102; GFX8: ; %bb.0: 103; GFX8-NEXT: s_cmp_eq_u32 s3, 0 104; GFX8-NEXT: s_cselect_b32 s3, 1, 0 105; GFX8-NEXT: v_mov_b32_e32 v0, s0 106; GFX8-NEXT: s_and_b32 s0, 1, s3 107; GFX8-NEXT: v_mov_b32_e32 v1, s1 108; GFX8-NEXT: v_mov_b32_e32 v2, s2 109; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 110; GFX8-NEXT: s_nop 3 111; GFX8-NEXT: v_div_fmas_f32 v0, v0, v1, v2 112; GFX8-NEXT: ; return to shader part epilog 113; 114; GFX10_W32-LABEL: s_div_fmas_f32: 115; GFX10_W32: ; %bb.0: 116; GFX10_W32-NEXT: s_cmp_eq_u32 s3, 0 117; GFX10_W32-NEXT: v_mov_b32_e32 v0, s1 118; GFX10_W32-NEXT: v_mov_b32_e32 v1, s2 119; GFX10_W32-NEXT: s_cselect_b32 s3, 1, 0 120; GFX10_W32-NEXT: s_and_b32 s3, 1, s3 121; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3 122; GFX10_W32-NEXT: v_div_fmas_f32 v0, s0, v0, v1 123; GFX10_W32-NEXT: ; return to shader part epilog 124; 125; GFX10_W64-LABEL: s_div_fmas_f32: 126; GFX10_W64: ; %bb.0: 127; GFX10_W64-NEXT: s_cmp_eq_u32 s3, 0 128; GFX10_W64-NEXT: v_mov_b32_e32 v0, s1 129; GFX10_W64-NEXT: s_cselect_b32 s3, 1, 0 130; GFX10_W64-NEXT: v_mov_b32_e32 v1, s2 131; GFX10_W64-NEXT: s_and_b32 s3, 1, s3 132; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3 133; GFX10_W64-NEXT: v_div_fmas_f32 v0, s0, v0, v1 134; GFX10_W64-NEXT: ; return to shader part epilog 135 %vcc = icmp eq i32 %d, 0 136 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %vcc) 137 ret float %result 138} 139 140define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double inreg %c, i32 inreg %d) { 141; GFX7-LABEL: s_div_fmas_f64: 142; GFX7: ; %bb.0: 143; GFX7-NEXT: s_cmp_eq_u32 s6, 0 144; GFX7-NEXT: s_cselect_b32 s6, 1, 0 145; GFX7-NEXT: v_mov_b32_e32 v0, s0 146; GFX7-NEXT: v_mov_b32_e32 v1, s1 147; GFX7-NEXT: v_mov_b32_e32 v2, s2 148; GFX7-NEXT: v_mov_b32_e32 v4, s4 149; GFX7-NEXT: s_and_b32 s0, 1, s6 150; GFX7-NEXT: 
v_mov_b32_e32 v3, s3 151; GFX7-NEXT: v_mov_b32_e32 v5, s5 152; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 153; GFX7-NEXT: s_nop 3 154; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 155; GFX7-NEXT: v_readfirstlane_b32 s0, v0 156; GFX7-NEXT: v_readfirstlane_b32 s1, v1 157; GFX7-NEXT: ; return to shader part epilog 158; 159; GFX8-LABEL: s_div_fmas_f64: 160; GFX8: ; %bb.0: 161; GFX8-NEXT: s_cmp_eq_u32 s6, 0 162; GFX8-NEXT: s_cselect_b32 s6, 1, 0 163; GFX8-NEXT: v_mov_b32_e32 v0, s0 164; GFX8-NEXT: v_mov_b32_e32 v1, s1 165; GFX8-NEXT: v_mov_b32_e32 v2, s2 166; GFX8-NEXT: v_mov_b32_e32 v4, s4 167; GFX8-NEXT: s_and_b32 s0, 1, s6 168; GFX8-NEXT: v_mov_b32_e32 v3, s3 169; GFX8-NEXT: v_mov_b32_e32 v5, s5 170; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 171; GFX8-NEXT: s_nop 3 172; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 173; GFX8-NEXT: v_readfirstlane_b32 s0, v0 174; GFX8-NEXT: v_readfirstlane_b32 s1, v1 175; GFX8-NEXT: ; return to shader part epilog 176; 177; GFX10_W32-LABEL: s_div_fmas_f64: 178; GFX10_W32: ; %bb.0: 179; GFX10_W32-NEXT: s_cmp_eq_u32 s6, 0 180; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 181; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 182; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 183; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 184; GFX10_W32-NEXT: s_cselect_b32 s6, 1, 0 185; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 186; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 187; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] 188; GFX10_W32-NEXT: v_readfirstlane_b32 s0, v0 189; GFX10_W32-NEXT: v_readfirstlane_b32 s1, v1 190; GFX10_W32-NEXT: ; return to shader part epilog 191; 192; GFX10_W64-LABEL: s_div_fmas_f64: 193; GFX10_W64: ; %bb.0: 194; GFX10_W64-NEXT: s_cmp_eq_u32 s6, 0 195; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 196; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0 197; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 198; GFX10_W64-NEXT: s_and_b32 s6, 1, s6 199; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 200; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 201; GFX10_W64-NEXT: 
v_mov_b32_e32 v3, s5 202; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] 203; GFX10_W64-NEXT: v_readfirstlane_b32 s0, v0 204; GFX10_W64-NEXT: v_readfirstlane_b32 s1, v1 205; GFX10_W64-NEXT: ; return to shader part epilog 206 %vcc = icmp eq i32 %d, 0 207 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %vcc) 208 ret double %result 209} 210 211define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { 212; GFX7-LABEL: test_div_fmas_f32: 213; GFX7: ; %bb.0: 214; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 215; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 216; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c 217; GFX7-NEXT: s_load_dword s6, s[0:1], 0x25 218; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e 219; GFX7-NEXT: s_mov_b32 s7, 0xf000 220; GFX7-NEXT: s_waitcnt lgkmcnt(0) 221; GFX7-NEXT: v_mov_b32_e32 v0, s2 222; GFX7-NEXT: v_mov_b32_e32 v1, s3 223; GFX7-NEXT: v_mov_b32_e32 v2, s6 224; GFX7-NEXT: s_and_b32 s0, 1, s0 225; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 226; GFX7-NEXT: s_mov_b32 s6, -1 227; GFX7-NEXT: s_nop 2 228; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 229; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 230; GFX7-NEXT: s_endpgm 231; 232; GFX8-LABEL: test_div_fmas_f32: 233; GFX8: ; %bb.0: 234; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c 235; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 236; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94 237; GFX8-NEXT: s_load_dword s5, s[0:1], 0xb8 238; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 239; GFX8-NEXT: s_waitcnt lgkmcnt(0) 240; GFX8-NEXT: v_mov_b32_e32 v0, s2 241; GFX8-NEXT: v_mov_b32_e32 v1, s3 242; GFX8-NEXT: v_mov_b32_e32 v2, s4 243; GFX8-NEXT: s_and_b32 s2, 1, s5 244; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 245; GFX8-NEXT: s_nop 3 246; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 247; GFX8-NEXT: v_mov_b32_e32 v0, s0 248; GFX8-NEXT: v_mov_b32_e32 v1, s1 249; GFX8-NEXT: flat_store_dword v[0:1], v2 
250; GFX8-NEXT: s_endpgm 251; 252; GFX10_W32-LABEL: test_div_fmas_f32: 253; GFX10_W32: ; %bb.0: 254; GFX10_W32-NEXT: s_clause 0x4 255; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0xb8 256; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 257; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 258; GFX10_W32-NEXT: s_load_dword s7, s[0:1], 0x4c 259; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 260; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 261; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 262; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 263; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 264; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 265; GFX10_W32-NEXT: v_div_fmas_f32 v0, s7, v0, v1 266; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 267; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] 268; GFX10_W32-NEXT: s_endpgm 269; 270; GFX10_W64-LABEL: test_div_fmas_f32: 271; GFX10_W64: ; %bb.0: 272; GFX10_W64-NEXT: s_clause 0x4 273; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0xb8 274; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 275; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 276; GFX10_W64-NEXT: s_load_dword s7, s[0:1], 0x4c 277; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 278; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 279; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 280; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 281; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 282; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 283; GFX10_W64-NEXT: v_div_fmas_f32 v0, s7, v0, v1 284; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 285; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] 286; GFX10_W64-NEXT: s_endpgm 287 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) 288 store float %result, float addrspace(1)* %out, align 4 289 ret void 290} 291 292define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { 293; GFX7-LABEL: test_div_fmas_f32_inline_imm_0: 294; GFX7: ; %bb.0: 295; GFX7-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 296; GFX7-NEXT: s_load_dword s2, s[0:1], 0x1c 297; GFX7-NEXT: s_load_dword s3, s[0:1], 0x25 298; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e 299; GFX7-NEXT: s_mov_b32 s6, -1 300; GFX7-NEXT: s_mov_b32 s7, 0xf000 301; GFX7-NEXT: s_waitcnt lgkmcnt(0) 302; GFX7-NEXT: v_mov_b32_e32 v0, s2 303; GFX7-NEXT: v_mov_b32_e32 v1, s3 304; GFX7-NEXT: s_and_b32 s0, 1, s0 305; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 306; GFX7-NEXT: s_nop 3 307; GFX7-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1 308; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 309; GFX7-NEXT: s_endpgm 310; 311; GFX8-LABEL: test_div_fmas_f32_inline_imm_0: 312; GFX8: ; %bb.0: 313; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 314; GFX8-NEXT: s_load_dword s3, s[0:1], 0x94 315; GFX8-NEXT: s_load_dword s4, s[0:1], 0xb8 316; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 317; GFX8-NEXT: s_waitcnt lgkmcnt(0) 318; GFX8-NEXT: v_mov_b32_e32 v0, s2 319; GFX8-NEXT: v_mov_b32_e32 v1, s3 320; GFX8-NEXT: s_and_b32 s2, 1, s4 321; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 322; GFX8-NEXT: s_nop 3 323; GFX8-NEXT: v_div_fmas_f32 v2, 1.0, v0, v1 324; GFX8-NEXT: v_mov_b32_e32 v0, s0 325; GFX8-NEXT: v_mov_b32_e32 v1, s1 326; GFX8-NEXT: flat_store_dword v[0:1], v2 327; GFX8-NEXT: s_endpgm 328; 329; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0: 330; GFX10_W32: ; %bb.0: 331; GFX10_W32-NEXT: s_clause 0x3 332; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0xb8 333; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 334; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x70 335; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 336; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 337; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 338; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 339; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 340; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 341; GFX10_W32-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 342; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] 343; GFX10_W32-NEXT: s_endpgm 344; 345; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: 346; GFX10_W64: ; %bb.0: 
347; GFX10_W64-NEXT: s_clause 0x3 348; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0xb8 349; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94 350; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x70 351; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 352; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 353; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 354; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 355; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 356; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 357; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 358; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] 359; GFX10_W64-NEXT: s_endpgm 360 %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) 361 store float %result, float addrspace(1)* %out, align 4 362 ret void 363} 364 365define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) { 366; GFX7-LABEL: test_div_fmas_f32_inline_imm_1: 367; GFX7: ; %bb.0: 368; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 369; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb 370; GFX7-NEXT: s_load_dword s3, s[0:1], 0xd 371; GFX7-NEXT: s_load_dword s0, s[0:1], 0x16 372; GFX7-NEXT: s_mov_b32 s6, -1 373; GFX7-NEXT: s_mov_b32 s7, 0xf000 374; GFX7-NEXT: s_waitcnt lgkmcnt(0) 375; GFX7-NEXT: v_mov_b32_e32 v0, s2 376; GFX7-NEXT: v_mov_b32_e32 v1, s3 377; GFX7-NEXT: s_and_b32 s0, 1, s0 378; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 379; GFX7-NEXT: s_nop 3 380; GFX7-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1 381; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 382; GFX7-NEXT: s_endpgm 383; 384; GFX8-LABEL: test_div_fmas_f32_inline_imm_1: 385; GFX8: ; %bb.0: 386; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c 387; GFX8-NEXT: s_load_dword s3, s[0:1], 0x34 388; GFX8-NEXT: s_load_dword s4, s[0:1], 0x58 389; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 390; GFX8-NEXT: s_waitcnt lgkmcnt(0) 391; GFX8-NEXT: v_mov_b32_e32 v0, s2 392; GFX8-NEXT: v_mov_b32_e32 v1, s3 393; GFX8-NEXT: s_and_b32 s2, 1, s4 394; GFX8-NEXT: 
v_cmp_ne_u32_e64 vcc, 0, s2 395; GFX8-NEXT: s_nop 3 396; GFX8-NEXT: v_div_fmas_f32 v2, v0, 1.0, v1 397; GFX8-NEXT: v_mov_b32_e32 v0, s0 398; GFX8-NEXT: v_mov_b32_e32 v1, s1 399; GFX8-NEXT: flat_store_dword v[0:1], v2 400; GFX8-NEXT: s_endpgm 401; 402; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1: 403; GFX10_W32: ; %bb.0: 404; GFX10_W32-NEXT: s_clause 0x3 405; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x58 406; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x34 407; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x2c 408; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 409; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 410; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 411; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 412; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 413; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 414; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 415; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] 416; GFX10_W32-NEXT: s_endpgm 417; 418; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: 419; GFX10_W64: ; %bb.0: 420; GFX10_W64-NEXT: s_clause 0x3 421; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x58 422; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x34 423; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x2c 424; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 425; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 426; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 427; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 428; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 429; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 430; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 431; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] 432; GFX10_W64-NEXT: s_endpgm 433 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) 434 store float %result, float addrspace(1)* %out, align 4 435 ret void 436} 437 438define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { 439; GFX7-LABEL: test_div_fmas_f32_inline_imm_2: 440; 
GFX7: ; %bb.0: 441; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 442; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 443; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c 444; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e 445; GFX7-NEXT: s_mov_b32 s6, -1 446; GFX7-NEXT: s_mov_b32 s7, 0xf000 447; GFX7-NEXT: s_waitcnt lgkmcnt(0) 448; GFX7-NEXT: v_mov_b32_e32 v0, s2 449; GFX7-NEXT: v_mov_b32_e32 v1, s3 450; GFX7-NEXT: s_and_b32 s0, 1, s0 451; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 452; GFX7-NEXT: s_nop 3 453; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0 454; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 455; GFX7-NEXT: s_endpgm 456; 457; GFX8-LABEL: test_div_fmas_f32_inline_imm_2: 458; GFX8: ; %bb.0: 459; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c 460; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 461; GFX8-NEXT: s_load_dword s4, s[0:1], 0xb8 462; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 463; GFX8-NEXT: s_waitcnt lgkmcnt(0) 464; GFX8-NEXT: v_mov_b32_e32 v0, s2 465; GFX8-NEXT: v_mov_b32_e32 v1, s3 466; GFX8-NEXT: s_and_b32 s2, 1, s4 467; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 468; GFX8-NEXT: s_nop 3 469; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, 1.0 470; GFX8-NEXT: v_mov_b32_e32 v0, s0 471; GFX8-NEXT: v_mov_b32_e32 v1, s1 472; GFX8-NEXT: flat_store_dword v[0:1], v2 473; GFX8-NEXT: s_endpgm 474; 475; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: 476; GFX10_W32: ; %bb.0: 477; GFX10_W32-NEXT: s_clause 0x3 478; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0xb8 479; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 480; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x4c 481; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 482; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 483; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 484; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 485; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 486; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 487; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 488; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] 489; GFX10_W32-NEXT: s_endpgm 490; 491; GFX10_W64-LABEL: 
test_div_fmas_f32_inline_imm_2: 492; GFX10_W64: ; %bb.0: 493; GFX10_W64-NEXT: s_clause 0x3 494; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0xb8 495; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 496; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x4c 497; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 498; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 499; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 500; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 501; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 502; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 503; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 504; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] 505; GFX10_W64-NEXT: s_endpgm 506 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) 507 store float %result, float addrspace(1)* %out, align 4 508 ret void 509} 510 511define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) { 512; GFX7-LABEL: test_div_fmas_f64: 513; GFX7: ; %bb.0: 514; GFX7-NEXT: s_load_dword s8, s[0:1], 0x11 515; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 516; GFX7-NEXT: s_waitcnt lgkmcnt(0) 517; GFX7-NEXT: v_mov_b32_e32 v0, s2 518; GFX7-NEXT: v_mov_b32_e32 v1, s3 519; GFX7-NEXT: v_mov_b32_e32 v2, s4 520; GFX7-NEXT: v_mov_b32_e32 v4, s6 521; GFX7-NEXT: s_and_b32 s2, 1, s8 522; GFX7-NEXT: v_mov_b32_e32 v3, s5 523; GFX7-NEXT: v_mov_b32_e32 v5, s7 524; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 525; GFX7-NEXT: s_nop 3 526; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 527; GFX7-NEXT: v_mov_b32_e32 v3, s1 528; GFX7-NEXT: v_mov_b32_e32 v2, s0 529; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 530; GFX7-NEXT: s_endpgm 531; 532; GFX8-LABEL: test_div_fmas_f64: 533; GFX8: ; %bb.0: 534; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 535; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 536; GFX8-NEXT: s_waitcnt lgkmcnt(0) 537; GFX8-NEXT: v_mov_b32_e32 v0, s2 538; GFX8-NEXT: v_mov_b32_e32 v1, s3 539; GFX8-NEXT: v_mov_b32_e32 v2, s4 540; GFX8-NEXT: 
v_mov_b32_e32 v4, s6 541; GFX8-NEXT: s_and_b32 s2, 1, s8 542; GFX8-NEXT: v_mov_b32_e32 v3, s5 543; GFX8-NEXT: v_mov_b32_e32 v5, s7 544; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 545; GFX8-NEXT: s_nop 3 546; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 547; GFX8-NEXT: v_mov_b32_e32 v3, s1 548; GFX8-NEXT: v_mov_b32_e32 v2, s0 549; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 550; GFX8-NEXT: s_endpgm 551; 552; GFX10_W32-LABEL: test_div_fmas_f64: 553; GFX10_W32: ; %bb.0: 554; GFX10_W32-NEXT: s_clause 0x1 555; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x44 556; GFX10_W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 557; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 558; GFX10_W32-NEXT: s_and_b32 s0, 1, s2 559; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8 560; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 561; GFX10_W32-NEXT: v_mov_b32_e32 v1, s9 562; GFX10_W32-NEXT: v_mov_b32_e32 v3, s11 563; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 564; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] 565; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0 566; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 567; GFX10_W32-NEXT: s_endpgm 568; 569; GFX10_W64-LABEL: test_div_fmas_f64: 570; GFX10_W64: ; %bb.0: 571; GFX10_W64-NEXT: s_clause 0x1 572; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x44 573; GFX10_W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 574; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 575; GFX10_W64-NEXT: s_and_b32 s0, 1, s2 576; GFX10_W64-NEXT: v_mov_b32_e32 v0, s8 577; GFX10_W64-NEXT: v_mov_b32_e32 v2, s10 578; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 579; GFX10_W64-NEXT: v_mov_b32_e32 v1, s9 580; GFX10_W64-NEXT: v_mov_b32_e32 v3, s11 581; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] 582; GFX10_W64-NEXT: v_mov_b32_e32 v2, 0 583; GFX10_W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 584; GFX10_W64-NEXT: s_endpgm 585 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) 586 store double %result, double addrspace(1)* %out, align 8 587 
ret void 588} 589 590define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) { 591; GFX7-LABEL: test_div_fmas_f32_cond_to_vcc: 592; GFX7: ; %bb.0: 593; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 594; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 595; GFX7-NEXT: s_mov_b32 s6, -1 596; GFX7-NEXT: s_mov_b32 s7, 0xf000 597; GFX7-NEXT: s_waitcnt lgkmcnt(0) 598; GFX7-NEXT: s_cmp_eq_u32 s3, 0 599; GFX7-NEXT: s_cselect_b32 s3, 1, 0 600; GFX7-NEXT: v_mov_b32_e32 v0, s0 601; GFX7-NEXT: s_and_b32 s0, 1, s3 602; GFX7-NEXT: v_mov_b32_e32 v1, s1 603; GFX7-NEXT: v_mov_b32_e32 v2, s2 604; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 605; GFX7-NEXT: s_nop 3 606; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 607; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 608; GFX7-NEXT: s_endpgm 609; 610; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc: 611; GFX8: ; %bb.0: 612; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 613; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 614; GFX8-NEXT: s_waitcnt lgkmcnt(0) 615; GFX8-NEXT: s_cmp_eq_u32 s7, 0 616; GFX8-NEXT: s_cselect_b32 s2, 1, 0 617; GFX8-NEXT: s_and_b32 s2, 1, s2 618; GFX8-NEXT: v_mov_b32_e32 v0, s4 619; GFX8-NEXT: v_mov_b32_e32 v1, s5 620; GFX8-NEXT: v_mov_b32_e32 v2, s6 621; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 622; GFX8-NEXT: s_nop 3 623; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 624; GFX8-NEXT: v_mov_b32_e32 v0, s0 625; GFX8-NEXT: v_mov_b32_e32 v1, s1 626; GFX8-NEXT: flat_store_dword v[0:1], v2 627; GFX8-NEXT: s_endpgm 628; 629; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc: 630; GFX10_W32: ; %bb.0: 631; GFX10_W32-NEXT: s_clause 0x1 632; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 633; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 634; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 635; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0 636; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 637; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 638; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 639; GFX10_W32-NEXT: s_and_b32 
s0, 1, s0 640; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 641; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 642; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 643; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] 644; GFX10_W32-NEXT: s_endpgm 645; 646; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc: 647; GFX10_W64: ; %bb.0: 648; GFX10_W64-NEXT: s_clause 0x1 649; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 650; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 651; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 652; GFX10_W64-NEXT: s_cmp_eq_u32 s7, 0 653; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 654; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 655; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 656; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 657; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 658; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 659; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 660; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] 661; GFX10_W64-NEXT: s_endpgm 662 %cmp = icmp eq i32 %i, 0 663 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) 664 store float %result, float addrspace(1)* %out, align 4 665 ret void 666} 667 668define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) { 669; GFX7-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: 670; GFX7: ; %bb.0: 671; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 672; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 673; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c 674; GFX7-NEXT: s_load_dword s0, s[0:1], 0x25 675; GFX7-NEXT: s_mov_b64 vcc, 0 676; GFX7-NEXT: s_mov_b32 s6, -1 677; GFX7-NEXT: s_waitcnt lgkmcnt(0) 678; GFX7-NEXT: v_mov_b32_e32 v0, s2 679; GFX7-NEXT: v_mov_b32_e32 v1, s3 680; GFX7-NEXT: v_mov_b32_e32 v2, s0 681; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 682; GFX7-NEXT: s_mov_b32 s7, 0xf000 683; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 684; GFX7-NEXT: s_endpgm 685; 686; GFX8-LABEL: 
test_div_fmas_f32_imm_false_cond_to_vcc: 687; GFX8: ; %bb.0: 688; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c 689; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 690; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94 691; GFX8-NEXT: s_mov_b64 vcc, 0 692; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 693; GFX8-NEXT: s_waitcnt lgkmcnt(0) 694; GFX8-NEXT: v_mov_b32_e32 v0, s2 695; GFX8-NEXT: v_mov_b32_e32 v1, s3 696; GFX8-NEXT: v_mov_b32_e32 v2, s4 697; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 698; GFX8-NEXT: v_mov_b32_e32 v0, s0 699; GFX8-NEXT: v_mov_b32_e32 v1, s1 700; GFX8-NEXT: flat_store_dword v[0:1], v2 701; GFX8-NEXT: s_endpgm 702; 703; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: 704; GFX10_W32: ; %bb.0: 705; GFX10_W32-NEXT: s_clause 0x3 706; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 707; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 708; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x4c 709; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 710; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0 711; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 712; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 713; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 714; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, v1 715; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 716; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] 717; GFX10_W32-NEXT: s_endpgm 718; 719; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: 720; GFX10_W64: ; %bb.0: 721; GFX10_W64-NEXT: s_clause 0x3 722; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 723; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94 724; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x4c 725; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 726; GFX10_W64-NEXT: s_mov_b64 vcc, 0 727; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 728; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 729; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 730; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, v1 731; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 732; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] 733; GFX10_W64-NEXT: s_endpgm 734 %result = call 
float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false)
735 store float %result, float addrspace(1)* %out, align 4
736 ret void
737}
738
; Constant-true condition. The i1 operand of llvm.amdgcn.div.fmas.f32 is the
; literal `true`; the checks below expect VCC to be set to all ones by a scalar
; move (s_mov_b64 vcc, -1, or s_mov_b32 vcc_lo, -1 in the wave32 run) before
; v_div_fmas_f32, with no compare instruction emitted.
; The [8 x i32] arguments separate %a, %b and %c in the kernel argument list,
; so each float is fetched by its own s_load_dword at a distinct offset.
739define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
740; GFX7-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
741; GFX7: ; %bb.0:
742; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
743; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13
744; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c
745; GFX7-NEXT: s_load_dword s0, s[0:1], 0x25
746; GFX7-NEXT: s_mov_b64 vcc, -1
747; GFX7-NEXT: s_mov_b32 s6, -1
748; GFX7-NEXT: s_waitcnt lgkmcnt(0)
749; GFX7-NEXT: v_mov_b32_e32 v0, s2
750; GFX7-NEXT: v_mov_b32_e32 v1, s3
751; GFX7-NEXT: v_mov_b32_e32 v2, s0
752; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
753; GFX7-NEXT: s_mov_b32 s7, 0xf000
754; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
755; GFX7-NEXT: s_endpgm
756;
757; GFX8-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
758; GFX8: ; %bb.0:
759; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
760; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70
761; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94
762; GFX8-NEXT: s_mov_b64 vcc, -1
763; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
764; GFX8-NEXT: s_waitcnt lgkmcnt(0)
765; GFX8-NEXT: v_mov_b32_e32 v0, s2
766; GFX8-NEXT: v_mov_b32_e32 v1, s3
767; GFX8-NEXT: v_mov_b32_e32 v2, s4
768; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
769; GFX8-NEXT: v_mov_b32_e32 v0, s0
770; GFX8-NEXT: v_mov_b32_e32 v1, s1
771; GFX8-NEXT: flat_store_dword v[0:1], v2
772; GFX8-NEXT: s_endpgm
773;
774; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
775; GFX10_W32: ; %bb.0:
776; GFX10_W32-NEXT: s_clause 0x3
777; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70
778; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94
779; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x4c
780; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
781; GFX10_W32-NEXT: s_mov_b32 vcc_lo, -1
782; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
783; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4
784; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5
785; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, v1
786; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
787; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3]
788; GFX10_W32-NEXT: s_endpgm
789;
790; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
791; GFX10_W64: ; %bb.0:
792; GFX10_W64-NEXT: s_clause 0x3
793; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70
794; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94
795; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x4c
796; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
797; GFX10_W64-NEXT: s_mov_b64 vcc, -1
798; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
799; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4
800; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5
801; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, v1
802; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
803; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3]
804; GFX10_W64-NEXT: s_endpgm
805 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true)
806 store float %result, float addrspace(1)* %out, align 4
807 ret void
808}
809
; Condition built from two compares joined by `and`. (%tid == 0) is a vector
; compare (v_cmp_eq_u32_e32) written to VCC; (%d != 0) depends only on a kernel
; argument. The checks expect the second compare to be done with scalar
; instructions (s_cmp_lg_u32 + s_cselect_b32), masked to bit 0 and turned into
; a lane mask by v_cmp_ne_u32_e64, then ANDed with the (%tid == 0) mask into
; VCC (s_and_b64, or s_and_b32 on vcc_lo in the wave32 run) ahead of
; v_div_fmas_f32.
; The three inputs use `load volatile`; the checks show three separate dword
; loads, each followed by s_waitcnt vmcnt(0).
810define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, [8 x i32], i32 %d) {
811; GFX7-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
812; GFX7: ; %bb.0:
813; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
814; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15
815; GFX7-NEXT: s_mov_b32 s2, 0
816; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
817; GFX7-NEXT: v_mov_b32_e32 v2, 0
818; GFX7-NEXT: s_mov_b32 s3, 0xf000
819; GFX7-NEXT: s_waitcnt lgkmcnt(0)
820; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
821; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 glc
822; GFX7-NEXT: s_waitcnt vmcnt(0)
823; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 glc
824; GFX7-NEXT: s_waitcnt vmcnt(0)
825; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 glc
826; GFX7-NEXT: s_waitcnt vmcnt(0)
827; GFX7-NEXT: s_cmp_lg_u32 s8, 0
828; GFX7-NEXT: s_cselect_b32 s0, 1, 0
829; GFX7-NEXT: s_and_b32 s0, 1, s0
830; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
831; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
832; GFX7-NEXT: s_mov_b32 s2, -1
833; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1]
834; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
835; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v1
836; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
837; GFX7-NEXT: s_endpgm
838;
839; GFX8-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
840; GFX8: ; %bb.0:
841; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
842; GFX8-NEXT: s_load_dword s2, s[0:1], 0x54
843; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
844; GFX8-NEXT: s_waitcnt lgkmcnt(0)
845; GFX8-NEXT: v_mov_b32_e32 v1, s6
846; GFX8-NEXT: v_mov_b32_e32 v2, s7
847; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
848; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
849; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1
850; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
851; GFX8-NEXT: v_add_u32_e32 v5, vcc, 8, v1
852; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
853; GFX8-NEXT: flat_load_dword v1, v[1:2] glc
854; GFX8-NEXT: s_waitcnt vmcnt(0)
855; GFX8-NEXT: flat_load_dword v2, v[3:4] glc
856; GFX8-NEXT: s_waitcnt vmcnt(0)
857; GFX8-NEXT: flat_load_dword v3, v[5:6] glc
858; GFX8-NEXT: s_waitcnt vmcnt(0)
859; GFX8-NEXT: s_add_u32 s0, s4, 8
860; GFX8-NEXT: s_addc_u32 s1, s5, 0
861; GFX8-NEXT: s_cmp_lg_u32 s2, 0
862; GFX8-NEXT: s_cselect_b32 s2, 1, 0
863; GFX8-NEXT: s_and_b32 s2, 1, s2
864; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
865; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2
866; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3]
867; GFX8-NEXT: s_nop 1
868; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
869; GFX8-NEXT: v_mov_b32_e32 v0, s0
870; GFX8-NEXT: v_mov_b32_e32 v1, s1
871; GFX8-NEXT: flat_store_dword v[0:1], v2
872; GFX8-NEXT: s_endpgm
873;
874; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
875; GFX10_W32: ; %bb.0:
876; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
877; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
878; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x54
879; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
880; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
881; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] glc dlc
882; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
883; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc
884; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
885; GFX10_W32-NEXT: global_load_dword v4, v1, s[6:7] offset:8 glc dlc
886; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
887; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
888; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0
889; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0
890; GFX10_W32-NEXT: s_and_b32 s0, 1, s0
891; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
892; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
893; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v4
894; GFX10_W32-NEXT: global_store_dword v1, v0, s[4:5] offset:8
895; GFX10_W32-NEXT: s_endpgm
896;
897; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
898; GFX10_W64: ; %bb.0:
899; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
900; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
901; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x54
902; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
903; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
904; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] glc dlc
905; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
906; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc
907; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
908; GFX10_W64-NEXT: global_load_dword v4, v1, s[6:7] offset:8 glc dlc
909; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
910; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
911; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
912; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0
913; GFX10_W64-NEXT: s_and_b32 s0, 1, s0
914; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
915; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1]
916; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v4
917; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8
918; GFX10_W64-NEXT: s_endpgm
919 %tid = call i32 @llvm.amdgcn.workitem.id.x()
920 %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
921 %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
922 %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
923 %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
924
925 %a = load volatile float, float addrspace(1)* %gep.a
926 %b = load volatile float, float addrspace(1)* %gep.b
927 %c = load volatile float, float addrspace(1)* %gep.c
928
929 %cmp0 = icmp eq i32 %tid, 0
930 %cmp1 = icmp ne i32 %d, 0
931 %and = and i1 %cmp0, %cmp1
932
933 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and)
934 store float %result, float addrspace(1)* %gep.out, align 4
935 ret void
936}
937
; Condition is an i1 phi: `false` when arriving from %entry, (%val != 0) when
; arriving from %bb, which is entered only when %tid == 0. The checks expect
; the phi to be held in an SGPR as 0/1 (zeroed by s_mov_b32 before the branch,
; set by s_cselect_b32 ..., 1, 0 inside %bb), then masked with 1 and converted
; to VCC by v_cmp_ne_u32_e64 in %exit before v_div_fmas_f32.
; Unlike the preceding logical_cond test the loads here are not volatile, and
; the checks show a single dwordx3 load covering %a, %b and %c.
938define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, [8 x i32], float addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %dummy) {
939; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
940; GFX7: ; %bb.0: ; %entry
941; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
942; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13
943; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
944; GFX7-NEXT: v_mov_b32_e32 v2, 0
945; GFX7-NEXT: s_mov_b32 s10, 0
946; GFX7-NEXT: s_mov_b32 s11, 0xf000
947; GFX7-NEXT: s_waitcnt lgkmcnt(0)
948; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[8:11], 0 addr64
949; GFX7-NEXT: s_mov_b32 s6, 0
950; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
951; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc
952; GFX7-NEXT: s_cbranch_execz BB13_2
953; GFX7-NEXT: ; %bb.1: ; %bb
954; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d
955; GFX7-NEXT: s_waitcnt lgkmcnt(0)
956; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
957; GFX7-NEXT: s_waitcnt lgkmcnt(0)
958; GFX7-NEXT: s_cmp_lg_u32 s0, 0
959; GFX7-NEXT: s_cselect_b32 s6, 1, 0
960; GFX7-NEXT: BB13_2: ; %exit
961; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
962; GFX7-NEXT: s_and_b32 s0, 1, s6
963; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
964; GFX7-NEXT: s_mov_b32 s10, -1
965; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
966; GFX7-NEXT: s_waitcnt vmcnt(0)
967; GFX7-NEXT: s_nop 0
968; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
969; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
970; GFX7-NEXT: s_endpgm
971;
972; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
973; GFX8: ; %bb.0: ; %entry
974; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
975; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
976; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
977; GFX8-NEXT: s_mov_b32 s6, 0
978; GFX8-NEXT: s_waitcnt lgkmcnt(0)
979; GFX8-NEXT: v_mov_b32_e32 v1, s4
980; GFX8-NEXT: v_mov_b32_e32 v2, s5
981; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
982; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
983; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2]
984; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
985; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
986; GFX8-NEXT: s_cbranch_execz BB13_2
987; GFX8-NEXT: ; %bb.1: ; %bb
988; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
989; GFX8-NEXT: s_waitcnt lgkmcnt(0)
990; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0
991; GFX8-NEXT: s_waitcnt lgkmcnt(0)
992; GFX8-NEXT: s_cmp_lg_u32 s0, 0
993; GFX8-NEXT: s_cselect_b32 s6, 1, 0
994; GFX8-NEXT: BB13_2: ; %exit
995; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
996; GFX8-NEXT: s_add_u32 s0, s2, 8
997; GFX8-NEXT: s_addc_u32 s1, s3, 0
998; GFX8-NEXT: s_and_b32 s2, 1, s6
999; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
1000; GFX8-NEXT: s_waitcnt vmcnt(0)
1001; GFX8-NEXT: s_nop 2
1002; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
1003; GFX8-NEXT: v_mov_b32_e32 v0, s0
1004; GFX8-NEXT: v_mov_b32_e32 v1, s1
1005; GFX8-NEXT: flat_store_dword v[0:1], v2
1006; GFX8-NEXT: s_endpgm
1007;
1008; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
1009; GFX10_W32: ; %bb.0: ; %entry
1010; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
1011; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1012; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1013; GFX10_W32-NEXT: s_mov_b32 s5, 0
1014; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
1015; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3]
1016; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3
1017; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1018; GFX10_W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
1019; GFX10_W32-NEXT: s_cbranch_execz BB13_2
1020; GFX10_W32-NEXT: ; %bb.1: ; %bb
1021; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
1022; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
1023; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x0
1024; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
1025; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0
1026; GFX10_W32-NEXT: s_cselect_b32 s5, 1, 0
1027; GFX10_W32-NEXT: BB13_2: ; %exit
1028; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
1029; GFX10_W32-NEXT: s_and_b32 s0, 1, s5
1030; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
1031; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
1032; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
1033; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
1034; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
1035; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] offset:8
1036; GFX10_W32-NEXT: s_endpgm
1037;
1038; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
1039; GFX10_W64: ; %bb.0: ; %entry
1040; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
1041; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1042; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1043; GFX10_W64-NEXT: s_mov_b32 s6, 0
1044; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
1045; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3]
1046; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
1047; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1048; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
1049; GFX10_W64-NEXT: s_cbranch_execz BB13_2
1050; GFX10_W64-NEXT: ; %bb.1: ; %bb
1051; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
1052; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
1053; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x0
1054; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
1055; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
1056; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0
1057; GFX10_W64-NEXT: BB13_2: ; %exit
1058; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5]
1059; GFX10_W64-NEXT: s_and_b32 s0, 1, s6
1060; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
1061; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
1062; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
1063; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
1064; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
1065; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] offset:8
1066; GFX10_W64-NEXT: s_endpgm
1067entry:
1068 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1069 %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
1070 %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
1071 %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
1072 %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
1073
1074 %a = load float, float addrspace(1)* %gep.a
1075 %b = load float, float addrspace(1)* %gep.b
1076 %c = load float, float addrspace(1)* %gep.c
1077
1078 %cmp0 = icmp eq i32 %tid, 0
1079 br i1 %cmp0, label %bb, label %exit
1080
1081bb:
1082 %val = load i32, i32 addrspace(1)* %dummy
1083 %cmp1 = icmp ne i32 %val, 0
1084 br label %exit
1085
1086exit:
1087 %cond = phi i1 [false, %entry], [%cmp1, %bb]
1088 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond)
1089 store float %result, float addrspace(1)* %gep.out, align 4
1090 ret void
1091}
1092
; Declarations of the intrinsics called by the tests in this file; each one
; references attribute group #0, which is defined right after them.
1093declare i32 @llvm.amdgcn.workitem.id.x() #0
1094declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) #0
1095declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) #0
1096
1097attributes #0 = { nounwind readnone speculatable }
1098