; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s

; Checks instruction selection for llvm.experimental.constrained.fmul on half
; and half-vector operands when compiling for gfx900, fiji and gfx1010. Every
; test function carries attribute group #0 (strictfp) and passes
; "round.tonearest"; what varies per function is the exception-behavior
; metadata, the vector width, and whether the arguments are ordinary
; arguments (v_ prefix) or inreg arguments of an amdgpu_ps function (s_ prefix).
;
; The assertion comments inside each function are produced by
; utils/update_llc_test_checks.py (see the first line of this file).
;
; NOTE(review): "constained" in the function names looks like a typo for
; "constrained". The names are matched by the -LABEL checks, so renaming them
; means regenerating the assertions.

; FIXME: promotion not handled without f16 insts
; NOTE(review): presumably this is why no llc invocation above uses a CPU
; without f16 instructions - confirm before adding one.

; Scalar half multiply with "fpexcept.strict". The checks for all three CPUs
; expect a single v_mul_f16_e32; the gfx1010 checks additionally expect an
; s_waitcnt_vscnt after the entry s_waitcnt.
define half @v_constained_fmul_f16_fpexcept_strict(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fmul_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fmul_f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; Scalar half multiply with "fpexcept.ignore". The expected instructions are
; the same as in the "fpexcept.strict" scalar test.
define half @v_constained_fmul_f16_fpexcept_ignore(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fmul_f16_fpexcept_ignore:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fmul_f16_fpexcept_ignore:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret half %val
}

; Scalar half multiply with "fpexcept.maytrap". The expected instructions are
; the same as in the "fpexcept.strict" scalar test.
define half @v_constained_fmul_f16_fpexcept_maytrap(half %x, half %y) #0 {
; GCN-LABEL: v_constained_fmul_f16_fpexcept_maytrap:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fmul_f16_fpexcept_maytrap:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret half %val
}

; <2 x half> multiply with "fpexcept.strict". The gfx900 and gfx1010 checks
; expect one v_pk_mul_f16. The fiji checks expect an SDWA v_mul_f16 on the
; WORD_1 halves, a v_mul_f16_e32, and a v_or_b32 that combines the two results.
define <2 x half> @v_constained_fmul_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; <2 x half> multiply with "fpexcept.ignore". The expected instructions are
; the same as in the "fpexcept.strict" <2 x half> test.
define <2 x half> @v_constained_fmul_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
  ret <2 x half> %val
}

; <2 x half> multiply with "fpexcept.maytrap". The expected instructions are
; the same as in the "fpexcept.strict" <2 x half> test.
define <2 x half> @v_constained_fmul_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
  ret <2 x half> %val
}

; <3 x half> multiply with "fpexcept.strict". The gfx900 and gfx1010 checks
; expect a v_pk_mul_f16 on v0/v2 plus a v_mul_f16_e32 on v1/v3. The fiji checks
; expect the SDWA multiply + v_mul_f16_e32 + v_or_b32 sequence on v0/v2 and a
; separate v_mul_f16_e32 on v1/v3.
define <3 x half> @v_constained_fmul_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
; GFX9-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT:    v_mul_f16_e32 v1, v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
; GFX10-NEXT:    v_mul_f16_e32 v1, v1, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <3 x half> @llvm.experimental.constrained.fmul.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <3 x half> %val
}

; <4 x half> multiply with "fpexcept.strict". On every CPU the checks expect
; four separate f16 multiplies (two v_mul_f16_e32 and two SDWA forms reading
; the WORD_1 halves) whose results are then recombined: v_lshl_or_b32 on
; gfx900, v_or_b32 on fiji, and v_and_b32 with 0xffff followed by
; v_lshl_or_b32 on gfx1010. No v_pk_mul_f16 appears in the expected output,
; which is what the FIXME below records.
; FIXME: Scalarized
define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
; GFX9-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_mul_f16_e32 v1, v1, v3
; GFX9-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
; GFX8-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mul_f16_e32 v4, v0, v2
; GFX10-NEXT:    v_mul_f16_e32 v6, v1, v3
; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4
; GFX10-NEXT:    v_and_b32_e32 v3, v5, v6
; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val = call <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <4 x half> %val
}

; Scalar half multiply in an amdgpu_ps function whose arguments are inreg
; (they appear as s2/s3 in the expected output), with "fpexcept.strict". The
; gfx900 and fiji checks expect s3 to be copied into v0 before a
; v_mul_f16_e32; the gfx1010 checks expect a v_mul_f16_e64 that takes s2 and
; s3 directly.
define amdgpu_ps half @s_constained_fmul_f16_fpexcept_strict(half inreg %x, half inreg %y) #0 {
; GCN-LABEL: s_constained_fmul_f16_fpexcept_strict:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_mul_f16_e32 v0, s2, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_constained_fmul_f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mul_f16_e64 v0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret half %val
}

; <2 x half> multiply in an amdgpu_ps function with inreg arguments and
; "fpexcept.strict". The gfx900 checks expect s3 to be copied into v0 before a
; v_pk_mul_f16; the gfx1010 checks expect a v_pk_mul_f16 that takes s2 and s3
; directly. The fiji checks expect s_lshr_b32 by 16 on each input, an SDWA
; v_mul_f16 writing WORD_1, a v_mul_f16_e32 on the unshifted inputs, and a
; v_or_b32 that combines the two results.
define amdgpu_ps <2 x half> @s_constained_fmul_v2f16_fpexcept_strict(<2 x half> inreg %x, <2 x half> inreg %y) #0 {
; GFX9-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_pk_mul_f16 v0, s2, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshr_b32 s0, s3, 16
; GFX8-NEXT:    s_lshr_b32 s1, s2, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mul_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mul_f16_e32 v1, s2, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_pk_mul_f16 v0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
  %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
  ret <2 x half> %val
}

; Declarations of the constrained fmul overloads called above (f16, v2f16,
; v3f16, v4f16). Each takes the two operands followed by the rounding-mode and
; exception-behavior metadata arguments.
declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) #1
declare <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half>, <2 x half>, metadata, metadata) #1
declare <3 x half> @llvm.experimental.constrained.fmul.v3f16(<3 x half>, <3 x half>, metadata, metadata) #1
declare <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half>, <4 x half>, metadata, metadata) #1

; #0 is attached to every test function above; #1 is attached to the
; intrinsic declarations.
attributes #0 = { strictfp }
attributes #1 = { inaccessiblememonly nounwind willreturn }