; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Denormal mode shouldn't matter for f16, check with and without flushing.
; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-IEEE %s
; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-FLUSH %s

; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s

; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s

; Scalar half fdiv with no fast-math flags and no !fpmath metadata.
define half @v_fdiv_f16(half %a, half %b) {
; GFX6-IEEE-LABEL: v_fdiv_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_fdiv_f16:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX89-NEXT: v_rcp_f32_e32 v2, v2
; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv half %a, %b
  ret half %fdiv
}

; Scalar half fdiv carrying the afn flag. Every run expects a reciprocal
; followed by a multiply, with no div_scale/div_fixup sequence.
define half @v_fdiv_f16_afn(half %a, half %b) {
; GFX6-LABEL: v_fdiv_f16_afn:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_fdiv_f16_afn:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_rcp_f16_e32 v1, v1
; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv afn half %a, %b
  ret half %fdiv
}

; Scalar half fdiv tagged with !fpmath !0 (the metadata node is defined
; elsewhere in this file). The expected output is the same as for the
; untagged division above.
define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_fdiv_f16_ulp25:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX89-NEXT: v_rcp_f32_e32 v2, v2
; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv half %a, %b, !fpmath !0
  ret half %fdiv
}

; Reciprocal form: the constant 1.0 divided by %x, with no flags.
define half @v_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_rcp_f16:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX89-NEXT: v_rcp_f32_e32 v1, v1
; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv half 1.0, %x
  ret half %fdiv
}

; Reciprocal with only the arcp flag. The expected output is the same as for
; the unflagged reciprocal above.
define half @v_rcp_f16_arcp(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_rcp_f16_arcp:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX89-NEXT: v_rcp_f32_e32 v1, v1
; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv arcp half 1.0, %x
  ret half %fdiv
}

; Reciprocal with both arcp and afn. The fiji/gfx900 runs expect a single
; v_rcp_f16; the tahiti runs expect convert, f32 rcp, multiply, convert back.
define half @v_rcp_f16_arcp_afn(half %x) {
; GFX6-LABEL: v_rcp_f16_arcp_afn:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_rcp_f16_arcp_afn:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_rcp_f16_e32 v0, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv arcp afn half 1.0, %x
  ret half %fdiv
}

; Reciprocal tagged with !fpmath !0 (defined elsewhere in this file). The
; fiji/gfx900 runs expect a single v_rcp_f16, while the tahiti runs still
; expect the full div_scale/div_fmas/div_fixup expansion.
define half @v_rcp_f16_ulp25(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_rcp_f16_ulp25:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_rcp_f16_e32 v0, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv half 1.0, %x, !fpmath !0
  ret half %fdiv
}

; afn combined with !fpmath !0. The expected output is the same as for afn
; alone.
define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
; GFX6-LABEL: v_fdiv_f16_afn_ulp25:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_rcp_f16_e32 v1, v1
; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv afn half %a, %b, !fpmath !0
  ret half %fdiv
}

; arcp combined with !fpmath !0 on a general (non-reciprocal) division. The
; expected output is the same as for the plain scalar fdiv.
define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX89-NEXT: v_rcp_f32_e32 v2, v2
; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv arcp half %a, %b, !fpmath !0
  ret half %fdiv
}

; <2 x half> fdiv with no flags. The expected output performs one division
; sequence per element.
define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-LABEL: v_fdiv_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX8-NEXT: v_rcp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX8-NEXT: v_rcp_f32_e32 v5, v5
; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX9-NEXT: v_rcp_f32_e32 v5, v5
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv <2 x half> %a, %b
  ret <2 x half> %fdiv
}

; <2 x half> fdiv carrying the afn flag. Every run expects a reciprocal and a
; multiply per element.
define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX6-LABEL: v_fdiv_v2f16_afn:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_rcp_f32_e32 v2, v2
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_rcp_f16_e32 v2, v1
; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v1, 16
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_v2f16_afn:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v2, v1
; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv afn <2 x half> %a, %b
  ret <2 x half> %fdiv
}

; <2 x half> fdiv tagged with !fpmath !0 (defined elsewhere in this file).
; The expected output is the same as for the untagged vector division.
define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_ulp25:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX8-NEXT: v_rcp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX8-NEXT: v_rcp_f32_e32 v5, v5
; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_v2f16_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX9-NEXT: v_rcp_f32_e32 v5, v5
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv <2 x half> %a, %b, !fpmath !0
  ret <2 x half> %fdiv
}

; Vector reciprocal: <half 1.0, half 1.0> divided by %x, with no flags.
define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-NEXT: v_rcp_f32_e32 v1, v1
; GFX8-NEXT: v_rcp_f32_e32 v3, v3
; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_rcp_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-NEXT: v_rcp_f32_e32 v3, v3
; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
  ret <2 x half> %fdiv
}

define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6,
v5 849; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 850; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 851; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 852; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 853; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 854; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 855; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 856; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 857; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 858; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 859; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 860; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 861; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 862; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 863; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 864; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 865; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 866; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 867; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 868; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 869; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 870; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] 871; 872; GFX8-LABEL: v_rcp_v2f16_arcp: 873; GFX8: ; %bb.0: 874; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 875; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 876; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 877; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 878; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 879; GFX8-NEXT: v_rcp_f32_e32 v1, v1 880; GFX8-NEXT: v_rcp_f32_e32 v3, v3 881; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 882; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 883; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 884; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 885; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 886; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 887; GFX8-NEXT: v_mov_b32_e32 v2, 16 888; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 889; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 890; GFX8-NEXT: s_setpc_b64 s[30:31] 891; 892; GFX9-LABEL: v_rcp_v2f16_arcp: 893; GFX9: ; %bb.0: 894; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 895; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 896; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 897; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 898; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 899; GFX9-NEXT: v_rcp_f32_e32 v1, v1 900; GFX9-NEXT: v_rcp_f32_e32 v3, v3 901; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 902; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 903; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 904; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 905; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 906; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 907; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 908; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 909; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 910; GFX9-NEXT: s_setpc_b64 s[30:31] 911 %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x 912 ret <2 x half> %fdiv 913} 914 915define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) { 916; GFX6-LABEL: v_rcp_v2f16_arcp_afn: 917; GFX6: ; %bb.0: 918; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 919; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 920; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 921; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 922; GFX6-NEXT: v_rcp_f32_e32 v0, v0 923; GFX6-NEXT: v_rcp_f32_e32 v1, v1 924; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 925; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 926; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 927; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 928; GFX6-NEXT: s_setpc_b64 s[30:31] 929; 930; GFX8-LABEL: v_rcp_v2f16_arcp_afn: 931; GFX8: ; %bb.0: 932; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 933; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 934; GFX8-NEXT: v_rcp_f16_e32 v0, v0 935; GFX8-NEXT: v_mov_b32_e32 v2, 16 936; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 937; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 938; GFX8-NEXT: s_setpc_b64 s[30:31] 939; 940; GFX9-LABEL: v_rcp_v2f16_arcp_afn: 941; GFX9: ; %bb.0: 942; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 943; GFX9-NEXT: v_rcp_f16_e32 v1, v0 944; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 945; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 946; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 947; GFX9-NEXT: s_setpc_b64 s[30:31] 948 %fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x 949 ret <2 x half> %fdiv 950} 951 952define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { 953; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25: 954; GFX6-IEEE: ; %bb.0: 955; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 956; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 957; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 958; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 959; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 960; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 961; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 962; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 963; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 964; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 965; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 966; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 967; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 968; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 969; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 970; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 971; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 972; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 973; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 974; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 975; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 976; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 977; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 978; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 979; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 980; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 981; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, 
v3, v1, v2 982; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 983; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] 984; 985; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: 986; GFX6-FLUSH: ; %bb.0: 987; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 988; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 989; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 990; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 991; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 992; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 993; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 994; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 995; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 996; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 997; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 998; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 999; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 1000; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 1001; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1002; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 1003; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 1004; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 1005; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 1006; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1007; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 1008; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 1009; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 1010; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 1011; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1012; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 1013; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 1014; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 1015; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 1016; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 1017; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 1018; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1019; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 1020; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 1021; 
GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 1022; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] 1023; 1024; GFX8-LABEL: v_rcp_v2f16_ulp25: 1025; GFX8: ; %bb.0: 1026; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1027; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1028; GFX8-NEXT: v_rcp_f16_e32 v0, v0 1029; GFX8-NEXT: v_mov_b32_e32 v2, 16 1030; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1031; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1032; GFX8-NEXT: s_setpc_b64 s[30:31] 1033; 1034; GFX9-LABEL: v_rcp_v2f16_ulp25: 1035; GFX9: ; %bb.0: 1036; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1037; GFX9-NEXT: v_rcp_f16_e32 v1, v0 1038; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 1039; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 1040; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 1041; GFX9-NEXT: s_setpc_b64 s[30:31] 1042 %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0 1043 ret <2 x half> %fdiv 1044} 1045 1046define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) { 1047; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25: 1048; GFX6: ; %bb.0: 1049; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1050; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 1051; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 1052; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 1053; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 1054; GFX6-NEXT: v_rcp_f32_e32 v2, v2 1055; GFX6-NEXT: v_rcp_f32_e32 v3, v3 1056; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 1057; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 1058; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 1059; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 1060; GFX6-NEXT: s_setpc_b64 s[30:31] 1061; 1062; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: 1063; GFX8: ; %bb.0: 1064; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1065; GFX8-NEXT: v_rcp_f16_e32 v2, v1 1066; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 1067; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 1068; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1069; GFX8-NEXT: v_mov_b32_e32 v1, 16 1070; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1071; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1072; GFX8-NEXT: s_setpc_b64 s[30:31] 1073; 1074; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: 1075; GFX9: ; %bb.0: 1076; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1077; GFX9-NEXT: v_rcp_f16_e32 v2, v1 1078; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1079; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 1080; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1081; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 1082; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 1083; GFX9-NEXT: s_setpc_b64 s[30:31] 1084 %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 1085 ret <2 x half> %fdiv 1086} 1087 1088define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { 1089; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25: 1090; GFX6-IEEE: ; %bb.0: 1091; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1092; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 1093; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 1094; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 1095; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 1096; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 1097; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 1098; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 1099; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1100; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 1101; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 1102; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 1103; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 1104; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 1105; GFX6-IEEE-NEXT: v_div_fmas_f32 
v4, v4, v5, v7 1106; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 1107; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 1108; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 1109; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 1110; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 1111; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 1112; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 1113; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 1114; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 1115; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 1116; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 1117; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 1118; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 1119; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 1120; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] 1121; 1122; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25: 1123; GFX6-FLUSH: ; %bb.0: 1124; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1125; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 1126; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 1127; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 1128; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 1129; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 1130; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1131; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1132; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 1133; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5 1134; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6 1135; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 1136; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 1137; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1138; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 1139; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 1140; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 1141; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 1142; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1143; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 1144; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 1145; 
GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 1146; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 1147; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1148; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 1149; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 1150; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 1151; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 1152; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 1153; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 1154; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1155; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 1156; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 1157; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 1158; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] 1159; 1160; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25: 1161; GFX8: ; %bb.0: 1162; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1163; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 1164; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 1165; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 1166; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 1167; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 1168; GFX8-NEXT: v_rcp_f32_e32 v2, v2 1169; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6 1170; GFX8-NEXT: v_rcp_f32_e32 v5, v5 1171; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 1172; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 1173; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 1174; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 1175; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 1176; GFX8-NEXT: v_mov_b32_e32 v2, 16 1177; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 1178; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1179; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1180; GFX8-NEXT: s_setpc_b64 s[30:31] 1181; 1182; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: 1183; GFX9: ; %bb.0: 1184; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1185; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 1186; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 1187; GFX9-NEXT: 
v_cvt_f32_f16_e32 v5, v4 1188; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 1189; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 1190; GFX9-NEXT: v_rcp_f32_e32 v2, v2 1191; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 1192; GFX9-NEXT: v_rcp_f32_e32 v5, v5 1193; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 1194; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 1195; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 1196; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 1197; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 1198; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 1199; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 1200; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1201; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 1202; GFX9-NEXT: s_setpc_b64 s[30:31] 1203 %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0 1204 ret <2 x half> %fdiv 1205} 1206 1207define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) { 1208; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: 1209; GFX6: ; %bb.0: 1210; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1211; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 1212; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 1213; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 1214; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 1215; GFX6-NEXT: v_rcp_f32_e32 v2, v2 1216; GFX6-NEXT: v_rcp_f32_e32 v3, v3 1217; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 1218; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 1219; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 1220; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 1221; GFX6-NEXT: s_setpc_b64 s[30:31] 1222; 1223; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: 1224; GFX8: ; %bb.0: 1225; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1226; GFX8-NEXT: v_rcp_f16_e32 v2, v1 1227; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1228; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 1229; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1230; GFX8-NEXT: v_mov_b32_e32 v1, 16 1231; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1232; GFX8-NEXT: 
v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1233; GFX8-NEXT: s_setpc_b64 s[30:31] 1234; 1235; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: 1236; GFX9: ; %bb.0: 1237; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1238; GFX9-NEXT: v_rcp_f16_e32 v2, v1 1239; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1240; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 1241; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1242; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff 1243; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 1244; GFX9-NEXT: s_setpc_b64 s[30:31] 1245 %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0 1246 ret <2 x half> %fdiv 1247} 1248 1249!0 = !{float 2.500000e+00} 1250