1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s 3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s 4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10_W32 %s 5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10_W64 %s 6 7define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) { 8; GFX7-LABEL: v_div_fmas_f32: 9; GFX7: ; %bb.0: 10; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 12; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 13; GFX7-NEXT: s_nop 3 14; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 15; GFX7-NEXT: s_setpc_b64 s[30:31] 16; 17; GFX8-LABEL: v_div_fmas_f32: 18; GFX8: ; %bb.0: 19; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 20; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 21; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 22; GFX8-NEXT: s_nop 3 23; GFX8-NEXT: v_div_fmas_f32 v0, v0, v1, v2 24; GFX8-NEXT: s_setpc_b64 s[30:31] 25; 26; GFX10_W32-LABEL: v_div_fmas_f32: 27; GFX10_W32: ; %bb.0: 28; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29; GFX10_W32-NEXT: s_waitcnt_vscnt null, 0x0 30; GFX10_W32-NEXT: v_and_b32_e32 v3, 1, v3 31; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 32; GFX10_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 33; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2 34; GFX10_W32-NEXT: s_setpc_b64 s[30:31] 35; 36; GFX10_W64-LABEL: v_div_fmas_f32: 37; GFX10_W64: ; %bb.0: 38; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 39; GFX10_W64-NEXT: s_waitcnt_vscnt null, 0x0 40; GFX10_W64-NEXT: v_and_b32_e32 v3, 1, v3 41; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 42; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2 43; GFX10_W64-NEXT: s_setpc_b64 s[30:31] 44 %result = call float 
@llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) 45 ret float %result 46} 47 48define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) { 49; GFX7-LABEL: v_div_fmas_f64: 50; GFX7: ; %bb.0: 51; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 52; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 53; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 54; GFX7-NEXT: s_nop 3 55; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 56; GFX7-NEXT: s_setpc_b64 s[30:31] 57; 58; GFX8-LABEL: v_div_fmas_f64: 59; GFX8: ; %bb.0: 60; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 61; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 62; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 63; GFX8-NEXT: s_nop 3 64; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 65; GFX8-NEXT: s_setpc_b64 s[30:31] 66; 67; GFX10_W32-LABEL: v_div_fmas_f64: 68; GFX10_W32: ; %bb.0: 69; GFX10_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 70; GFX10_W32-NEXT: s_waitcnt_vscnt null, 0x0 71; GFX10_W32-NEXT: v_and_b32_e32 v6, 1, v6 72; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 73; GFX10_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 74; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 75; GFX10_W32-NEXT: s_setpc_b64 s[30:31] 76; 77; GFX10_W64-LABEL: v_div_fmas_f64: 78; GFX10_W64: ; %bb.0: 79; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 80; GFX10_W64-NEXT: s_waitcnt_vscnt null, 0x0 81; GFX10_W64-NEXT: v_and_b32_e32 v6, 1, v6 82; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 83; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 84; GFX10_W64-NEXT: s_setpc_b64 s[30:31] 85 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) 86 ret double %result 87} 88 89define amdgpu_ps float @s_div_fmas_f32(float inreg %a, float inreg %b, float inreg %c, i32 inreg %d) { 90; GFX7-LABEL: s_div_fmas_f32: 91; GFX7: ; %bb.0: 92; GFX7-NEXT: s_cmp_eq_u32 s3, 0 93; GFX7-NEXT: s_cselect_b32 s3, 1, 0 94; GFX7-NEXT: v_mov_b32_e32 v0, s0 95; GFX7-NEXT: s_and_b32 s0, 1, s3 96; 
GFX7-NEXT: v_mov_b32_e32 v1, s1 97; GFX7-NEXT: v_mov_b32_e32 v2, s2 98; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 99; GFX7-NEXT: s_nop 3 100; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 101; GFX7-NEXT: ; return to shader part epilog 102; 103; GFX8-LABEL: s_div_fmas_f32: 104; GFX8: ; %bb.0: 105; GFX8-NEXT: s_cmp_eq_u32 s3, 0 106; GFX8-NEXT: s_cselect_b32 s3, 1, 0 107; GFX8-NEXT: v_mov_b32_e32 v0, s0 108; GFX8-NEXT: s_and_b32 s0, 1, s3 109; GFX8-NEXT: v_mov_b32_e32 v1, s1 110; GFX8-NEXT: v_mov_b32_e32 v2, s2 111; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 112; GFX8-NEXT: s_nop 3 113; GFX8-NEXT: v_div_fmas_f32 v0, v0, v1, v2 114; GFX8-NEXT: ; return to shader part epilog 115; 116; GFX10_W32-LABEL: s_div_fmas_f32: 117; GFX10_W32: ; %bb.0: 118; GFX10_W32-NEXT: s_cmp_eq_u32 s3, 0 119; GFX10_W32-NEXT: v_mov_b32_e32 v0, s1 120; GFX10_W32-NEXT: v_mov_b32_e32 v1, s2 121; GFX10_W32-NEXT: s_cselect_b32 s3, 1, 0 122; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 123; GFX10_W32-NEXT: s_and_b32 s3, 1, s3 124; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3 125; GFX10_W32-NEXT: v_div_fmas_f32 v0, s0, v0, v1 126; GFX10_W32-NEXT: ; return to shader part epilog 127; 128; GFX10_W64-LABEL: s_div_fmas_f32: 129; GFX10_W64: ; %bb.0: 130; GFX10_W64-NEXT: s_cmp_eq_u32 s3, 0 131; GFX10_W64-NEXT: v_mov_b32_e32 v0, s1 132; GFX10_W64-NEXT: s_cselect_b32 s3, 1, 0 133; GFX10_W64-NEXT: v_mov_b32_e32 v1, s2 134; GFX10_W64-NEXT: s_and_b32 s3, 1, s3 135; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3 136; GFX10_W64-NEXT: v_div_fmas_f32 v0, s0, v0, v1 137; GFX10_W64-NEXT: ; return to shader part epilog 138 %vcc = icmp eq i32 %d, 0 139 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %vcc) 140 ret float %result 141} 142 143define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double inreg %c, i32 inreg %d) { 144; GFX7-LABEL: s_div_fmas_f64: 145; GFX7: ; %bb.0: 146; GFX7-NEXT: s_cmp_eq_u32 s6, 0 147; GFX7-NEXT: v_mov_b32_e32 v0, s0 148; GFX7-NEXT: s_cselect_b32 s6, 1, 0 
149; GFX7-NEXT: v_mov_b32_e32 v1, s1 150; GFX7-NEXT: v_mov_b32_e32 v2, s2 151; GFX7-NEXT: v_mov_b32_e32 v4, s4 152; GFX7-NEXT: s_and_b32 s0, 1, s6 153; GFX7-NEXT: v_mov_b32_e32 v3, s3 154; GFX7-NEXT: v_mov_b32_e32 v5, s5 155; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 156; GFX7-NEXT: s_nop 3 157; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 158; GFX7-NEXT: v_readfirstlane_b32 s0, v0 159; GFX7-NEXT: v_readfirstlane_b32 s1, v1 160; GFX7-NEXT: ; return to shader part epilog 161; 162; GFX8-LABEL: s_div_fmas_f64: 163; GFX8: ; %bb.0: 164; GFX8-NEXT: s_cmp_eq_u32 s6, 0 165; GFX8-NEXT: v_mov_b32_e32 v0, s0 166; GFX8-NEXT: s_cselect_b32 s6, 1, 0 167; GFX8-NEXT: v_mov_b32_e32 v1, s1 168; GFX8-NEXT: v_mov_b32_e32 v2, s2 169; GFX8-NEXT: v_mov_b32_e32 v4, s4 170; GFX8-NEXT: s_and_b32 s0, 1, s6 171; GFX8-NEXT: v_mov_b32_e32 v3, s3 172; GFX8-NEXT: v_mov_b32_e32 v5, s5 173; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 174; GFX8-NEXT: s_nop 3 175; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 176; GFX8-NEXT: v_readfirstlane_b32 s0, v0 177; GFX8-NEXT: v_readfirstlane_b32 s1, v1 178; GFX8-NEXT: ; return to shader part epilog 179; 180; GFX10_W32-LABEL: s_div_fmas_f64: 181; GFX10_W32: ; %bb.0: 182; GFX10_W32-NEXT: s_cmp_eq_u32 s6, 0 183; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 184; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 185; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 186; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 187; GFX10_W32-NEXT: s_cselect_b32 s6, 1, 0 188; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 189; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 190; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 191; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] 192; GFX10_W32-NEXT: v_readfirstlane_b32 s0, v0 193; GFX10_W32-NEXT: v_readfirstlane_b32 s1, v1 194; GFX10_W32-NEXT: ; return to shader part epilog 195; 196; GFX10_W64-LABEL: s_div_fmas_f64: 197; GFX10_W64: ; %bb.0: 198; GFX10_W64-NEXT: s_cmp_eq_u32 s6, 0 199; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 200; GFX10_W64-NEXT: s_cselect_b32 s6, 
1, 0 201; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 202; GFX10_W64-NEXT: s_and_b32 s6, 1, s6 203; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 204; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 205; GFX10_W64-NEXT: v_mov_b32_e32 v3, s5 206; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] 207; GFX10_W64-NEXT: v_readfirstlane_b32 s0, v0 208; GFX10_W64-NEXT: v_readfirstlane_b32 s1, v1 209; GFX10_W64-NEXT: ; return to shader part epilog 210 %vcc = icmp eq i32 %d, 0 211 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %vcc) 212 ret double %result 213} 214 215define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { 216; GFX7-LABEL: test_div_fmas_f32: 217; GFX7: ; %bb.0: 218; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 219; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 220; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c 221; GFX7-NEXT: s_load_dword s6, s[0:1], 0x25 222; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e 223; GFX7-NEXT: s_mov_b32 s7, 0xf000 224; GFX7-NEXT: s_waitcnt lgkmcnt(0) 225; GFX7-NEXT: v_mov_b32_e32 v0, s2 226; GFX7-NEXT: v_mov_b32_e32 v1, s3 227; GFX7-NEXT: v_mov_b32_e32 v2, s6 228; GFX7-NEXT: s_and_b32 s0, 1, s0 229; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 230; GFX7-NEXT: s_mov_b32 s6, -1 231; GFX7-NEXT: s_nop 2 232; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 233; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 234; GFX7-NEXT: s_endpgm 235; 236; GFX8-LABEL: test_div_fmas_f32: 237; GFX8: ; %bb.0: 238; GFX8-NEXT: s_load_dword s2, s[0:1], 0xb8 239; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c 240; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70 241; GFX8-NEXT: s_load_dword s5, s[0:1], 0x94 242; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 243; GFX8-NEXT: s_waitcnt lgkmcnt(0) 244; GFX8-NEXT: s_and_b32 s2, 1, s2 245; GFX8-NEXT: v_mov_b32_e32 v0, s3 246; GFX8-NEXT: v_mov_b32_e32 v1, s4 247; GFX8-NEXT: v_mov_b32_e32 v2, s5 248; GFX8-NEXT: v_cmp_ne_u32_e64 
vcc, 0, s2 249; GFX8-NEXT: s_nop 3 250; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 251; GFX8-NEXT: v_mov_b32_e32 v0, s0 252; GFX8-NEXT: v_mov_b32_e32 v1, s1 253; GFX8-NEXT: flat_store_dword v[0:1], v2 254; GFX8-NEXT: s_endpgm 255; 256; GFX10_W32-LABEL: test_div_fmas_f32: 257; GFX10_W32: ; %bb.0: 258; GFX10_W32-NEXT: s_clause 0x4 259; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 260; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 261; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94 262; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x4c 263; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 264; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 265; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 266; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 267; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 268; GFX10_W32-NEXT: v_mov_b32_e32 v1, s4 269; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 270; GFX10_W32-NEXT: v_div_fmas_f32 v2, s5, v0, v1 271; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 272; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 273; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off 274; GFX10_W32-NEXT: s_endpgm 275; 276; GFX10_W64-LABEL: test_div_fmas_f32: 277; GFX10_W64: ; %bb.0: 278; GFX10_W64-NEXT: s_clause 0x4 279; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 280; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 281; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94 282; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x4c 283; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 284; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 285; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 286; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 287; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 288; GFX10_W64-NEXT: v_mov_b32_e32 v1, s4 289; GFX10_W64-NEXT: v_div_fmas_f32 v2, s5, v0, v1 290; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 291; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 292; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off 293; GFX10_W64-NEXT: s_endpgm 294 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) 295 store float %result, float 
addrspace(1)* %out, align 4 296 ret void 297} 298 299define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { 300; GFX7-LABEL: test_div_fmas_f32_inline_imm_0: 301; GFX7: ; %bb.0: 302; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 303; GFX7-NEXT: s_load_dword s2, s[0:1], 0x1c 304; GFX7-NEXT: s_load_dword s3, s[0:1], 0x25 305; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e 306; GFX7-NEXT: s_mov_b32 s6, -1 307; GFX7-NEXT: s_mov_b32 s7, 0xf000 308; GFX7-NEXT: s_waitcnt lgkmcnt(0) 309; GFX7-NEXT: v_mov_b32_e32 v0, s2 310; GFX7-NEXT: v_mov_b32_e32 v1, s3 311; GFX7-NEXT: s_and_b32 s0, 1, s0 312; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 313; GFX7-NEXT: s_nop 3 314; GFX7-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1 315; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 316; GFX7-NEXT: s_endpgm 317; 318; GFX8-LABEL: test_div_fmas_f32_inline_imm_0: 319; GFX8: ; %bb.0: 320; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 321; GFX8-NEXT: s_load_dword s3, s[0:1], 0x94 322; GFX8-NEXT: s_load_dword s4, s[0:1], 0xb8 323; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 324; GFX8-NEXT: s_waitcnt lgkmcnt(0) 325; GFX8-NEXT: v_mov_b32_e32 v0, s2 326; GFX8-NEXT: v_mov_b32_e32 v1, s3 327; GFX8-NEXT: s_and_b32 s2, 1, s4 328; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 329; GFX8-NEXT: s_nop 3 330; GFX8-NEXT: v_div_fmas_f32 v2, 1.0, v0, v1 331; GFX8-NEXT: v_mov_b32_e32 v0, s0 332; GFX8-NEXT: v_mov_b32_e32 v1, s1 333; GFX8-NEXT: flat_store_dword v[0:1], v2 334; GFX8-NEXT: s_endpgm 335; 336; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0: 337; GFX10_W32: ; %bb.0: 338; GFX10_W32-NEXT: s_clause 0x3 339; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 340; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 341; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 342; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 343; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 344; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 345; GFX10_W32-NEXT: 
s_and_b32 s2, 1, s2 346; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 347; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 348; GFX10_W32-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0 349; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 350; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 351; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off 352; GFX10_W32-NEXT: s_endpgm 353; 354; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: 355; GFX10_W64: ; %bb.0: 356; GFX10_W64-NEXT: s_clause 0x3 357; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 358; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 359; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 360; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 361; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 362; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 363; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 364; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 365; GFX10_W64-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0 366; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 367; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 368; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off 369; GFX10_W64-NEXT: s_endpgm 370 %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) 371 store float %result, float addrspace(1)* %out, align 4 372 ret void 373} 374 375define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) { 376; GFX7-LABEL: test_div_fmas_f32_inline_imm_1: 377; GFX7: ; %bb.0: 378; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 379; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb 380; GFX7-NEXT: s_load_dword s3, s[0:1], 0xd 381; GFX7-NEXT: s_load_dword s0, s[0:1], 0x16 382; GFX7-NEXT: s_mov_b32 s6, -1 383; GFX7-NEXT: s_mov_b32 s7, 0xf000 384; GFX7-NEXT: s_waitcnt lgkmcnt(0) 385; GFX7-NEXT: v_mov_b32_e32 v0, s2 386; GFX7-NEXT: v_mov_b32_e32 v1, s3 387; GFX7-NEXT: s_and_b32 s0, 1, s0 388; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 389; GFX7-NEXT: s_nop 3 390; GFX7-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1 391; GFX7-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 392; GFX7-NEXT: s_endpgm 393; 394; GFX8-LABEL: test_div_fmas_f32_inline_imm_1: 395; GFX8: ; %bb.0: 396; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c 397; GFX8-NEXT: s_load_dword s3, s[0:1], 0x34 398; GFX8-NEXT: s_load_dword s4, s[0:1], 0x58 399; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 400; GFX8-NEXT: s_waitcnt lgkmcnt(0) 401; GFX8-NEXT: v_mov_b32_e32 v0, s2 402; GFX8-NEXT: v_mov_b32_e32 v1, s3 403; GFX8-NEXT: s_and_b32 s2, 1, s4 404; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 405; GFX8-NEXT: s_nop 3 406; GFX8-NEXT: v_div_fmas_f32 v2, v0, 1.0, v1 407; GFX8-NEXT: v_mov_b32_e32 v0, s0 408; GFX8-NEXT: v_mov_b32_e32 v1, s1 409; GFX8-NEXT: flat_store_dword v[0:1], v2 410; GFX8-NEXT: s_endpgm 411; 412; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1: 413; GFX10_W32: ; %bb.0: 414; GFX10_W32-NEXT: s_clause 0x3 415; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x58 416; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34 417; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c 418; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 419; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 420; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 421; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 422; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 423; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 424; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0 425; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 426; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 427; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off 428; GFX10_W32-NEXT: s_endpgm 429; 430; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: 431; GFX10_W64: ; %bb.0: 432; GFX10_W64-NEXT: s_clause 0x3 433; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x58 434; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34 435; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c 436; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 437; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 438; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 439; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 440; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 441; GFX10_W64-NEXT: 
v_div_fmas_f32 v2, s4, 1.0, v0 442; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 443; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 444; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off 445; GFX10_W64-NEXT: s_endpgm 446 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) 447 store float %result, float addrspace(1)* %out, align 4 448 ret void 449} 450 451define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { 452; GFX7-LABEL: test_div_fmas_f32_inline_imm_2: 453; GFX7: ; %bb.0: 454; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 455; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 456; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c 457; GFX7-NEXT: s_load_dword s0, s[0:1], 0x2e 458; GFX7-NEXT: s_mov_b32 s6, -1 459; GFX7-NEXT: s_mov_b32 s7, 0xf000 460; GFX7-NEXT: s_waitcnt lgkmcnt(0) 461; GFX7-NEXT: v_mov_b32_e32 v0, s2 462; GFX7-NEXT: v_mov_b32_e32 v1, s3 463; GFX7-NEXT: s_and_b32 s0, 1, s0 464; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 465; GFX7-NEXT: s_nop 3 466; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0 467; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 468; GFX7-NEXT: s_endpgm 469; 470; GFX8-LABEL: test_div_fmas_f32_inline_imm_2: 471; GFX8: ; %bb.0: 472; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c 473; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 474; GFX8-NEXT: s_load_dword s4, s[0:1], 0xb8 475; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 476; GFX8-NEXT: s_waitcnt lgkmcnt(0) 477; GFX8-NEXT: v_mov_b32_e32 v0, s2 478; GFX8-NEXT: v_mov_b32_e32 v1, s3 479; GFX8-NEXT: s_and_b32 s2, 1, s4 480; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 481; GFX8-NEXT: s_nop 3 482; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, 1.0 483; GFX8-NEXT: v_mov_b32_e32 v0, s0 484; GFX8-NEXT: v_mov_b32_e32 v1, s1 485; GFX8-NEXT: flat_store_dword v[0:1], v2 486; GFX8-NEXT: s_endpgm 487; 488; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: 489; GFX10_W32: ; %bb.0: 490; GFX10_W32-NEXT: s_clause 0x3 
491; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 492; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 493; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c 494; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 495; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 496; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 497; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 498; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 499; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 500; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0 501; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 502; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 503; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off 504; GFX10_W32-NEXT: s_endpgm 505; 506; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: 507; GFX10_W64: ; %bb.0: 508; GFX10_W64-NEXT: s_clause 0x3 509; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 510; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 511; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c 512; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 513; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 514; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 515; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 516; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 517; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0 518; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 519; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 520; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off 521; GFX10_W64-NEXT: s_endpgm 522 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) 523 store float %result, float addrspace(1)* %out, align 4 524 ret void 525} 526 527define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) { 528; GFX7-LABEL: test_div_fmas_f64: 529; GFX7: ; %bb.0: 530; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 531; GFX7-NEXT: s_load_dword s0, s[0:1], 0x11 532; GFX7-NEXT: s_waitcnt lgkmcnt(0) 533; GFX7-NEXT: v_mov_b32_e32 v0, s6 534; GFX7-NEXT: v_mov_b32_e32 v2, s8 535; GFX7-NEXT: v_mov_b32_e32 v4, s10 536; GFX7-NEXT: s_and_b32 s0, 
1, s0 537; GFX7-NEXT: v_mov_b32_e32 v1, s7 538; GFX7-NEXT: v_mov_b32_e32 v3, s9 539; GFX7-NEXT: v_mov_b32_e32 v5, s11 540; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 541; GFX7-NEXT: s_nop 3 542; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 543; GFX7-NEXT: v_mov_b32_e32 v2, s4 544; GFX7-NEXT: v_mov_b32_e32 v3, s5 545; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 546; GFX7-NEXT: s_endpgm 547; 548; GFX8-LABEL: test_div_fmas_f64: 549; GFX8: ; %bb.0: 550; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 551; GFX8-NEXT: s_load_dword s0, s[0:1], 0x44 552; GFX8-NEXT: s_waitcnt lgkmcnt(0) 553; GFX8-NEXT: v_mov_b32_e32 v0, s6 554; GFX8-NEXT: v_mov_b32_e32 v2, s8 555; GFX8-NEXT: v_mov_b32_e32 v4, s10 556; GFX8-NEXT: s_and_b32 s0, 1, s0 557; GFX8-NEXT: v_mov_b32_e32 v1, s7 558; GFX8-NEXT: v_mov_b32_e32 v3, s9 559; GFX8-NEXT: v_mov_b32_e32 v5, s11 560; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 561; GFX8-NEXT: s_nop 3 562; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] 563; GFX8-NEXT: v_mov_b32_e32 v2, s4 564; GFX8-NEXT: v_mov_b32_e32 v3, s5 565; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 566; GFX8-NEXT: s_endpgm 567; 568; GFX10_W32-LABEL: test_div_fmas_f64: 569; GFX10_W32: ; %bb.0: 570; GFX10_W32-NEXT: s_clause 0x1 571; GFX10_W32-NEXT: s_load_dword s8, s[0:1], 0x44 572; GFX10_W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 573; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 574; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 575; GFX10_W32-NEXT: s_and_b32 s8, 1, s8 576; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 577; GFX10_W32-NEXT: v_mov_b32_e32 v2, s6 578; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 579; GFX10_W32-NEXT: v_mov_b32_e32 v3, s7 580; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 581; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] 582; GFX10_W32-NEXT: v_mov_b32_e32 v3, s1 583; GFX10_W32-NEXT: v_mov_b32_e32 v2, s0 584; GFX10_W32-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 585; GFX10_W32-NEXT: s_endpgm 586; 587; GFX10_W64-LABEL: test_div_fmas_f64: 588; GFX10_W64: 
; %bb.0: 589; GFX10_W64-NEXT: s_clause 0x1 590; GFX10_W64-NEXT: s_load_dword s8, s[0:1], 0x44 591; GFX10_W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 592; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 593; GFX10_W64-NEXT: s_and_b32 s8, 1, s8 594; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 595; GFX10_W64-NEXT: v_mov_b32_e32 v2, s6 596; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 597; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 598; GFX10_W64-NEXT: v_mov_b32_e32 v3, s7 599; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] 600; GFX10_W64-NEXT: v_mov_b32_e32 v3, s1 601; GFX10_W64-NEXT: v_mov_b32_e32 v2, s0 602; GFX10_W64-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 603; GFX10_W64-NEXT: s_endpgm 604 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) 605 store double %result, double addrspace(1)* %out, align 8 606 ret void 607} 608 609define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) { 610; GFX7-LABEL: test_div_fmas_f32_cond_to_vcc: 611; GFX7: ; %bb.0: 612; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 613; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 614; GFX7-NEXT: s_mov_b32 s6, -1 615; GFX7-NEXT: s_mov_b32 s7, 0xf000 616; GFX7-NEXT: s_waitcnt lgkmcnt(0) 617; GFX7-NEXT: s_cmp_eq_u32 s3, 0 618; GFX7-NEXT: s_cselect_b32 s3, 1, 0 619; GFX7-NEXT: v_mov_b32_e32 v0, s0 620; GFX7-NEXT: s_and_b32 s0, 1, s3 621; GFX7-NEXT: v_mov_b32_e32 v1, s1 622; GFX7-NEXT: v_mov_b32_e32 v2, s2 623; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 624; GFX7-NEXT: s_nop 3 625; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 626; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 627; GFX7-NEXT: s_endpgm 628; 629; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc: 630; GFX8: ; %bb.0: 631; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 632; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 633; GFX8-NEXT: s_waitcnt lgkmcnt(0) 634; GFX8-NEXT: s_cmp_eq_u32 s7, 0 635; GFX8-NEXT: s_cselect_b32 s2, 1, 0 636; GFX8-NEXT: s_and_b32 
s2, 1, s2 637; GFX8-NEXT: v_mov_b32_e32 v0, s4 638; GFX8-NEXT: v_mov_b32_e32 v1, s5 639; GFX8-NEXT: v_mov_b32_e32 v2, s6 640; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 641; GFX8-NEXT: s_nop 3 642; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 643; GFX8-NEXT: v_mov_b32_e32 v0, s0 644; GFX8-NEXT: v_mov_b32_e32 v1, s1 645; GFX8-NEXT: flat_store_dword v[0:1], v2 646; GFX8-NEXT: s_endpgm 647; 648; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc: 649; GFX10_W32: ; %bb.0: 650; GFX10_W32-NEXT: s_clause 0x1 651; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 652; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 653; GFX10_W32-NEXT: ; implicit-def: $vcc_hi 654; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) 655; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0 656; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 657; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 658; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 659; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 660; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 661; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 662; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 663; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 664; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off 665; GFX10_W32-NEXT: s_endpgm 666; 667; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc: 668; GFX10_W64: ; %bb.0: 669; GFX10_W64-NEXT: s_clause 0x1 670; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 671; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 672; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) 673; GFX10_W64-NEXT: s_cmp_eq_u32 s7, 0 674; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 675; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 676; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 677; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 678; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 679; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 680; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 681; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 682; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off 683; GFX10_W64-NEXT: s_endpgm 684 %cmp = icmp eq i32 %i, 0 685 %result = call float 
@llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp)
686 store float %result, float addrspace(1)* %out, align 4
687 ret void
688}
689
; Kernel that calls llvm.amdgcn.div.fmas.f32 with the constant condition
; `i1 false` and stores the result to %out.
; The unnamed [8 x i32] arguments sit between %out, %a, %b and %c; presumably
; they only space out the kernel-argument offsets (each float is fetched by its
; own s_load_dword in the expectations) - confirm against the original intent.
; The expected output still materialises the constant in vcc with
; `v_cmp_ne_u32_e64 vcc, 0, 0` (vcc_lo in the GFX10_W32 expectations).
690define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
691; GFX7-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
692; GFX7: ; %bb.0:
693; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
694; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13
695; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c
696; GFX7-NEXT: s_load_dword s0, s[0:1], 0x25
697; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0
698; GFX7-NEXT: s_mov_b32 s6, -1
699; GFX7-NEXT: s_waitcnt lgkmcnt(0)
700; GFX7-NEXT: v_mov_b32_e32 v0, s2
701; GFX7-NEXT: v_mov_b32_e32 v1, s3
702; GFX7-NEXT: v_mov_b32_e32 v2, s0
703; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
704; GFX7-NEXT: s_mov_b32 s7, 0xf000
705; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
706; GFX7-NEXT: s_endpgm
707;
708; GFX8-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
709; GFX8: ; %bb.0:
710; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
711; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70
712; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94
713; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0
714; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
715; GFX8-NEXT: s_waitcnt lgkmcnt(0)
716; GFX8-NEXT: v_mov_b32_e32 v0, s2
717; GFX8-NEXT: v_mov_b32_e32 v1, s3
718; GFX8-NEXT: v_mov_b32_e32 v2, s4
719; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
720; GFX8-NEXT: v_mov_b32_e32 v0, s0
721; GFX8-NEXT: v_mov_b32_e32 v1, s1
722; GFX8-NEXT: flat_store_dword v[0:1], v2
723; GFX8-NEXT: s_endpgm
724;
725; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
726; GFX10_W32: ; %bb.0:
727; GFX10_W32-NEXT: s_clause 0x3
728; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70
729; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94
730; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c
731; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
732; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0,
0
733; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
734; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
735; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2
736; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3
737; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1
738; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
739; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
740; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
741; GFX10_W32-NEXT: s_endpgm
742;
743; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
744; GFX10_W64: ; %bb.0:
745; GFX10_W64-NEXT: s_clause 0x3
746; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70
747; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94
748; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c
749; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
750; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0
751; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
752; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2
753; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3
754; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1
755; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
756; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
757; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
758; GFX10_W64-NEXT: s_endpgm
759 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false)
760 store float %result, float addrspace(1)* %out, align 4
761 ret void
762}
763
; Kernel that calls llvm.amdgcn.div.fmas.f32 with the constant condition
; `i1 true` and stores the result to %out. The signature has the same shape as
; the `i1 false` variant, with unnamed [8 x i32] arguments between the floats.
; The expected output materialises the constant in vcc with
; `v_cmp_ne_u32_e64 vcc, 0, 1` (vcc_lo in the GFX10_W32 expectations).
764define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
765; GFX7-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
766; GFX7: ; %bb.0:
767; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
768; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13
769; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c
770; GFX7-NEXT: s_load_dword s0, s[0:1], 0x25
771; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1
772; GFX7-NEXT: s_mov_b32 s6, -1
773; GFX7-NEXT: s_waitcnt lgkmcnt(0)
774; GFX7-NEXT: v_mov_b32_e32 v0, s2
775; GFX7-NEXT: v_mov_b32_e32 v1, s3
776; GFX7-NEXT: v_mov_b32_e32 v2, s0
777; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
778; GFX7-NEXT: s_mov_b32 s7, 0xf000
779; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
780; GFX7-NEXT: s_endpgm
781;
782; GFX8-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
783; GFX8: ; %bb.0:
784; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
785; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70
786; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94
787; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1
788; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
789; GFX8-NEXT: s_waitcnt lgkmcnt(0)
790; GFX8-NEXT: v_mov_b32_e32 v0, s2
791; GFX8-NEXT: v_mov_b32_e32 v1, s3
792; GFX8-NEXT: v_mov_b32_e32 v2, s4
793; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
794; GFX8-NEXT: v_mov_b32_e32 v0, s0
795; GFX8-NEXT: v_mov_b32_e32 v1, s1
796; GFX8-NEXT: flat_store_dword v[0:1], v2
797; GFX8-NEXT: s_endpgm
798;
799; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
800; GFX10_W32: ; %bb.0:
801; GFX10_W32-NEXT: s_clause 0x3
802; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70
803; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94
804; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c
805; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
806; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 1
807; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
808; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
809; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2
810; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3
811; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1
812; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
813; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
814; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
815; GFX10_W32-NEXT: s_endpgm
816;
817; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
818; GFX10_W64: ; %bb.0:
819; GFX10_W64-NEXT: s_clause 0x3
820; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70
821; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94
822; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c
823; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
824; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1
825; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
826; GFX10_W64-NEXT:
v_mov_b32_e32 v0, s2
827; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3
828; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1
829; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
830; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
831; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
832; GFX10_W64-NEXT: s_endpgm
833 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true)
834 store float %result, float addrspace(1)* %out, align 4
835 ret void
836}
837
; Kernel whose condition is the logical AND of two compares:
;   %cmp0 = icmp eq i32 %tid, 0   (%tid is llvm.amdgcn.workitem.id.x)
;   %cmp1 = icmp ne i32 %d, 0     (%d is a kernel argument)
; %a, %b and %c are volatile loads from %in at elements %tid, %tid+1 and %tid+2;
; the result is stored to element 2 of %out.
; In the expected output the two compare masks are combined into vcc with
; s_and_b64 (s_and_b32 into vcc_lo in the GFX10_W32 expectations) before
; v_div_fmas_f32.
838define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, [8 x i32], i32 %d) {
839; GFX7-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
840; GFX7: ; %bb.0:
841; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
842; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15
843; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
844; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2
845; GFX7-NEXT: s_mov_b32 s2, 0
846; GFX7-NEXT: s_mov_b32 s3, 0xf000
847; GFX7-NEXT: s_waitcnt lgkmcnt(0)
848; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
849; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64
850; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4
851; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
852; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
853; GFX7-NEXT: s_cmp_lg_u32 s8, 0
854; GFX7-NEXT: s_cselect_b32 s6, 1, 0
855; GFX7-NEXT: s_and_b32 s0, 1, s6
856; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
857; GFX7-NEXT: s_mov_b32 s2, -1
858; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1]
859; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
860; GFX7-NEXT: s_waitcnt vmcnt(0)
861; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v0
862; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
863; GFX7-NEXT: s_endpgm
864;
865; GFX8-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
866; GFX8: ; %bb.0:
867; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
868; GFX8-NEXT: s_load_dword s2, s[0:1], 0x54
869; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
870; GFX8-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1]
871; GFX8-NEXT: s_waitcnt lgkmcnt(0)
872; GFX8-NEXT: v_mov_b32_e32 v3, s6
873; GFX8-NEXT: v_mov_b32_e32 v4, s7
874; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
875; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
876; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1
877; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
878; GFX8-NEXT: v_add_u32_e32 v5, vcc, 8, v1
879; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
880; GFX8-NEXT: flat_load_dword v1, v[1:2]
881; GFX8-NEXT: flat_load_dword v2, v[3:4]
882; GFX8-NEXT: flat_load_dword v3, v[5:6]
883; GFX8-NEXT: s_add_u32 s0, s4, 8
884; GFX8-NEXT: s_addc_u32 s1, s5, 0
885; GFX8-NEXT: s_cmp_lg_u32 s2, 0
886; GFX8-NEXT: s_cselect_b32 s2, 1, 0
887; GFX8-NEXT: s_and_b32 s2, 1, s2
888; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
889; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2
890; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3]
891; GFX8-NEXT: s_nop 1
892; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
893; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
894; GFX8-NEXT: v_mov_b32_e32 v0, s0
895; GFX8-NEXT: v_mov_b32_e32 v1, s1
896; GFX8-NEXT: flat_store_dword v[0:1], v2
897; GFX8-NEXT: s_endpgm
898;
899; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
900; GFX10_W32: ; %bb.0:
901; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
902; GFX10_W32-NEXT: v_ashrrev_i32_e32 v1, 31, v0
903; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x54
904; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
905; GFX10_W32-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1]
906; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
907; GFX10_W32-NEXT: v_mov_b32_e32 v3, s6
908; GFX10_W32-NEXT: v_mov_b32_e32 v4, s7
909; GFX10_W32-NEXT: s_add_u32 s0, s4, 8
910; GFX10_W32-NEXT: s_addc_u32 s1, s5, 0
911; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0
912; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v3, v1
913; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0
914; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
915; GFX10_W32-NEXT: s_and_b32 s2, 1, s2
916; GFX10_W32-NEXT: v_add_co_u32_e64 v3, vcc_lo, v1, 8
917; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2
918; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
919; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
920; GFX10_W32-NEXT: s_clause 0x2
921; GFX10_W32-NEXT: global_load_dword v1, v[1:2], off
922; GFX10_W32-NEXT: global_load_dword v2, v[3:4], off offset:-4
923; GFX10_W32-NEXT: global_load_dword v3, v[3:4], off
924; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s2
925; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
926; GFX10_W32-NEXT: v_div_fmas_f32 v2, v1, v2, v3
927; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
928; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
929; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
930; GFX10_W32-NEXT: s_endpgm
931;
932; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
933; GFX10_W64: ; %bb.0:
934; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
935; GFX10_W64-NEXT: v_ashrrev_i32_e32 v1, 31, v0
936; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x54
937; GFX10_W64-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1]
938; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
939; GFX10_W64-NEXT: v_mov_b32_e32 v3, s6
940; GFX10_W64-NEXT: v_mov_b32_e32 v4, s7
941; GFX10_W64-NEXT: s_add_u32 s0, s4, 8
942; GFX10_W64-NEXT: s_addc_u32 s1, s5, 0
943; GFX10_W64-NEXT: s_cmp_lg_u32 s2, 0
944; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v3, v1
945; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0
946; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc
947; GFX10_W64-NEXT: s_and_b32 s2, 1, s2
948; GFX10_W64-NEXT: v_add_co_u32_e64 v3, vcc, v1, 8
949; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2
950; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v4, vcc, 0, v2, vcc
951; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
952; GFX10_W64-NEXT: s_clause 0x2
953; GFX10_W64-NEXT: global_load_dword v1, v[1:2], off
954; GFX10_W64-NEXT: global_load_dword v2, v[3:4], off offset:-4
955; GFX10_W64-NEXT: global_load_dword v3, v[3:4], off
956; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[2:3]
957; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
958; GFX10_W64-NEXT: v_div_fmas_f32 v2, v1, v2, v3
959; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
960;
GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
961; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
962; GFX10_W64-NEXT: s_endpgm
963 %tid = call i32 @llvm.amdgcn.workitem.id.x()
964 %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
965 %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
966 %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
967 %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
968
969 %a = load volatile float, float addrspace(1)* %gep.a
970 %b = load volatile float, float addrspace(1)* %gep.b
971 %c = load volatile float, float addrspace(1)* %gep.c
972
973 %cmp0 = icmp eq i32 %tid, 0
974 %cmp1 = icmp ne i32 %d, 0
975 %and = and i1 %cmp0, %cmp1
976
977 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and)
978 store float %result, float addrspace(1)* %gep.out, align 4
979 ret void
980}
981
; Kernel whose condition is an i1 phi joining two control-flow paths:
;   - `false` on the edge coming straight from %entry;
;   - `icmp ne i32 %val, 0` on the edge from %bb, where %val is loaded from
;     %dummy. %bb is entered only when llvm.amdgcn.workitem.id.x is 0.
; %a, %b and %c are ordinary (non-volatile) loads from %in at elements %tid,
; %tid+1 and %tid+2; the expectations fetch them with a single dwordx3 load.
; The result is stored to element 2 of %out.
; In the expected output the phi value is carried in an SGPR as 0/1 and turned
; back into vcc (vcc_lo in the GFX10_W32 expectations) by v_cmp_ne_u32_e64 in
; the %exit block.
982define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, [8 x i32], float addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %dummy) {
983; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
984; GFX7: ; %bb.0: ; %entry
985; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
986; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13
987; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
988; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2
989; GFX7-NEXT: s_mov_b32 s10, 0
990; GFX7-NEXT: s_mov_b32 s11, 0xf000
991; GFX7-NEXT: s_waitcnt lgkmcnt(0)
992; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[8:11], 0 addr64
993; GFX7-NEXT: s_mov_b32 s2, 0
994; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
995; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc
996; GFX7-NEXT: s_cbranch_execz BB13_2
997; GFX7-NEXT: ; %bb.1: ; %bb
998; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d
999; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1000; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
1001; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1002; GFX7-NEXT: s_cmp_lg_u32 s0, 0
1003; GFX7-NEXT: s_cselect_b32 s2, 1, 0
1004; GFX7-NEXT: BB13_2: ; %exit
1005; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
1006; GFX7-NEXT: s_and_b32 s0, 1, s2
1007; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
1008; GFX7-NEXT: s_mov_b32 s10, -1
1009; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
1010; GFX7-NEXT: s_nop 1
1011; GFX7-NEXT: s_waitcnt vmcnt(0)
1012; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
1013; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
1014; GFX7-NEXT: s_endpgm
1015;
1016; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
1017; GFX8: ; %bb.0: ; %entry
1018; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1019; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x4c
1020; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1021; GFX8-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1]
1022; GFX8-NEXT: s_mov_b32 s2, 0
1023; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1024; GFX8-NEXT: v_mov_b32_e32 v3, s6
1025; GFX8-NEXT: v_mov_b32_e32 v4, s7
1026; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
1027; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
1028; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2]
1029; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1030; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
1031; GFX8-NEXT: s_cbranch_execz BB13_2
1032; GFX8-NEXT: ; %bb.1: ; %bb
1033; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
1034; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1035; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0
1036; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1037; GFX8-NEXT: s_cmp_lg_u32 s0, 0
1038; GFX8-NEXT: s_cselect_b32 s2, 1, 0
1039; GFX8-NEXT: BB13_2: ; %exit
1040; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
1041; GFX8-NEXT: s_add_u32 s0, s4, 8
1042; GFX8-NEXT: s_addc_u32 s1, s5, 0
1043; GFX8-NEXT: s_and_b32 s2, 1, s2
1044; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
1045; GFX8-NEXT: s_nop 3
1046; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1047; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
1048; GFX8-NEXT: v_mov_b32_e32 v0, s0
1049; GFX8-NEXT: v_mov_b32_e32 v1, s1
1050; GFX8-NEXT: flat_store_dword v[0:1], v2
1051; GFX8-NEXT: s_endpgm
1052;
1053; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
1054; GFX10_W32: ; %bb.0: ; %entry
1055; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
1056; GFX10_W32-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1057; GFX10_W32-NEXT: s_mov_b32 s4, 0
1058; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
1059; GFX10_W32-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1]
1060; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
1061; GFX10_W32-NEXT: v_mov_b32_e32 v4, s3
1062; GFX10_W32-NEXT: v_mov_b32_e32 v3, s2
1063; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1064; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v3, v1
1065; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
1066; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1067; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v[1:2], off
1068; GFX10_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
1069; GFX10_W32-NEXT: s_cbranch_execz BB13_2
1070; GFX10_W32-NEXT: ; %bb.1: ; %bb
1071; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
1072; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
1073; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x0
1074; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
1075; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0
1076; GFX10_W32-NEXT: s_cselect_b32 s4, 1, 0
1077; GFX10_W32-NEXT: BB13_2: ; %exit
1078; GFX10_W32-NEXT: v_nop
1079; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
1080; GFX10_W32-NEXT: s_and_b32 s0, 1, s4
1081; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
1082; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
1083; GFX10_W32-NEXT: s_add_u32 s0, s2, 8
1084; GFX10_W32-NEXT: s_addc_u32 s1, s3, 0
1085; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
1086; GFX10_W32-NEXT: v_div_fmas_f32 v2, v1, v2, v3
1087; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
1088; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
1089; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
1090; GFX10_W32-NEXT: s_endpgm
1091;
1092; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
1093; GFX10_W64: ; %bb.0: ; %entry
1094; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
1095; GFX10_W64-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1096; GFX10_W64-NEXT: s_mov_b32 s6, 0
1097; GFX10_W64-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1]
1098; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
1099; GFX10_W64-NEXT: v_mov_b32_e32 v4, s3
1100; GFX10_W64-NEXT: v_mov_b32_e32 v3, s2
1101; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1102; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v3, v1
1103; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc
1104; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1105; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v[1:2], off
1106; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
1107; GFX10_W64-NEXT: s_cbranch_execz BB13_2
1108; GFX10_W64-NEXT: ; %bb.1: ; %bb
1109; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
1110; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
1111; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x0
1112; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
1113; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
1114; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0
1115; GFX10_W64-NEXT: BB13_2: ; %exit
1116; GFX10_W64-NEXT: v_nop
1117; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5]
1118; GFX10_W64-NEXT: s_and_b32 s0, 1, s6
1119; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
1120; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
1121; GFX10_W64-NEXT: s_add_u32 s0, s2, 8
1122; GFX10_W64-NEXT: s_addc_u32 s1, s3, 0
1123; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
1124; GFX10_W64-NEXT: v_div_fmas_f32 v2, v1, v2, v3
1125; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
1126; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
1127; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
1128; GFX10_W64-NEXT: s_endpgm
1129entry:
1130 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1131 %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
1132 %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
1133 %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
1134 %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
1135
1136 %a = load float, float addrspace(1)* %gep.a
1137 %b = load float, float addrspace(1)* %gep.b
1138 %c = load float, float addrspace(1)* %gep.c
1139
1140 %cmp0 = icmp eq i32 %tid, 0
1141 br i1 %cmp0, label %bb, label %exit
1142
1143bb:
1144 %val = load i32, i32 addrspace(1)* %dummy
1145 %cmp1 = icmp ne i32 %val, 0
1146 br label %exit
1147
1148exit:
1149 %cond = phi i1 [false, %entry], [%cmp1, %bb]
1150 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond)
1151 store float %result, float addrspace(1)* %gep.out, align 4
1152 ret void
1153}
1154
; Declarations of the intrinsics called by the functions in this file. All
; three share attribute group #0 (nounwind readnone speculatable), defined on
; the last line.
1155declare i32 @llvm.amdgcn.workitem.id.x() #0
1156declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) #0
1157declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) #0
1158
1159attributes #0 = { nounwind readnone speculatable }
1160