; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX906 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s

; Every function below performs an f32 llvm.fmuladd and truncates the result
; to half (scalar or vector), optionally clamping to [0.0, 1.0] with
; maxnum/minnum either before or after the truncation. The check lines record
; the instruction sequence expected from llc for each -mcpu listed above.

; Scalar case with all three operands already f32; only the result is
; truncated to half.
; GCN-LABEL: {{^}}mixlo_simple:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2{{$}}
; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2{{$}}
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mac_f32_e32
; CIVI: v_cvt_f16_f32_e32
define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
  %cvt.result = fptrunc float %result to half
  ret half %cvt.result
}

; Scalar case with all three operands extended from half to f32.
; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f16lo:
; GFX900: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
; CI: v_mac_f32
; CIVI: v_cvt_f16_f32
define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %src2.ext = fpext half %src2 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
  %cvt.result = fptrunc float %result to half
  ret half %cvt.result
}

; Scalar case where src0 and src1 are extended from half and src2 is f32.
; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mac_f32
define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
  %cvt.result = fptrunc float %result to half
  ret half %cvt.result
}

; Same operands as the previous function; the half result is clamped to
; [0.0, 1.0] after the fptrunc.
; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mac_f32_e32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]$}}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
  %cvt.result = fptrunc float %result to half
  %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
  ret half %clamp
}

; Same operands again; here the f32 result is clamped to [0.0, 1.0] before
; the fptrunc.
; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: s_setpc_b64

; CIVI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 {
  %src0.ext = fpext half %src0 to float
  %src1.ext = fpext half %src1 to float
  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %cvt.result = fptrunc float %clamp to half
  ret half %cvt.result
}

; <2 x half> operands extended to <2 x float>, result truncated back.
; FIXME: Should be able to avoid extra register because first
; operation only clobbers relevant lane.
; GCN-LABEL: {{^}}v_mad_mix_v2f32:
; GCN: s_waitcnt

; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]{{$}}

; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]{{$}}

; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %src2.ext = fpext <2 x half> %src2 to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
  %cvt.result = fptrunc <2 x float> %result to <2 x half>
  ret <2 x half> %cvt.result
}

; <3 x half> variant of the vector case.
; GCN-LABEL: {{^}}v_mad_mix_v3f32:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]

; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]

; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
  %src0.ext = fpext <3 x half> %src0 to <3 x float>
  %src1.ext = fpext <3 x half> %src1 to <3 x float>
  %src2.ext = fpext <3 x half> %src2 to <3 x float>
  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
  %cvt.result = fptrunc <3 x float> %result to <3 x half>
  ret <3 x half> %cvt.result
}

; <4 x half> variant of the vector case.
; GCN-LABEL: {{^}}v_mad_mix_v4f32:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]

; GFX906-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]

; GFX9-NEXT: v_mov_b32_e32 v0, v7
; GFX9-NEXT: v_mov_b32_e32 v1, v6
; GFX9-NEXT: s_setpc_b64
define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
  %src0.ext = fpext <4 x half> %src0 to <4 x float>
  %src1.ext = fpext <4 x half> %src1 to <4 x float>
  %src2.ext = fpext <4 x half> %src2 to <4 x float>
  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
  %cvt.result = fptrunc <4 x float> %result to <4 x half>
  ret <4 x half> %cvt.result
}

; <2 x half> case with both lanes clamped to [0.0, 1.0] after the fptrunc.
; FIXME: Fold clamp
; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt:
; GFX900: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp{{$}}

; GFX906: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp{{$}}

; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %src2.ext = fpext <2 x half> %src2 to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
  %cvt.result = fptrunc <2 x float> %result to <2 x half>
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %cvt.result, <2 x half> zeroinitializer)
  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
  ret <2 x half> %clamp
}

; <3 x half> case with all lanes clamped to [0.0, 1.0] after the fptrunc.
; FIXME: Should be packed into 2 registers per argument?
; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
; GCN: s_waitcnt
; GFX900-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX900-DAG: v_mad_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX900-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1]

; GFX906-DAG: v_fma_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX906-DAG: v_fma_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX906-DAG: v_fma_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1]


; GFX9-DAG: v_pk_max_f16 v1, v1, v1 clamp
; GFX9: v_mov_b32_e32 v0, v{{[0-9]+}}
; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
  %src0.ext = fpext <3 x half> %src0 to <3 x float>
  %src1.ext = fpext <3 x half> %src1 to <3 x float>
  %src2.ext = fpext <3 x half> %src2 to <3 x float>
  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
  %cvt.result = fptrunc <3 x float> %result to <3 x half>
  %max = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %cvt.result, <3 x half> zeroinitializer)
  %clamp = call <3 x half> @llvm.minnum.v3f16(<3 x half> %max, <3 x half> <half 1.0, half 1.0, half 1.0>)
  ret <3 x half> %clamp
}

; <4 x half> case with all lanes clamped to [0.0, 1.0] after the fptrunc.
; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp


; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp


; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
  %src0.ext = fpext <4 x half> %src0 to <4 x float>
  %src1.ext = fpext <4 x half> %src1 to <4 x float>
  %src2.ext = fpext <4 x half> %src2 to <4 x float>
  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
  %cvt.result = fptrunc <4 x float> %result to <4 x half>
  %max = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %cvt.result, <4 x half> zeroinitializer)
  %clamp = call <4 x half> @llvm.minnum.v4f16(<4 x half> %max, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>)
  ret <4 x half> %clamp
}

; <2 x half> case where only element 0 of the truncated result is clamped.
; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_lo:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]

; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]

; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %src2.ext = fpext <2 x half> %src2 to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
  %cvt.result = fptrunc <2 x float> %result to <2 x half>
  %cvt.lo = extractelement <2 x half> %cvt.result, i32 0
  %max.lo = call half @llvm.maxnum.f16(half %cvt.lo, half 0.0)
  %clamp.lo = call half @llvm.minnum.f16(half %max.lo, half 1.0)
  %insert = insertelement <2 x half> %cvt.result, half %clamp.lo, i32 0
  ret <2 x half> %insert
}

; <2 x half> case where only element 1 of the truncated result is clamped.
; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_hi:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp

; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp

; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %src2.ext = fpext <2 x half> %src2 to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
  %cvt.result = fptrunc <2 x float> %result to <2 x half>
  %cvt.hi = extractelement <2 x half> %cvt.result, i32 1
  %max.hi = call half @llvm.maxnum.f16(half %cvt.hi, half 0.0)
  %clamp.hi = call half @llvm.minnum.f16(half %max.hi, half 1.0)
  %insert = insertelement <2 x half> %cvt.result, half %clamp.hi, i32 1
  ret <2 x half> %insert
}

; <2 x half> case with the <2 x float> result clamped before the fptrunc.
; FIXME: Should be able to use mixlo/mixhi
; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:
; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp

; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp

; GFX9: v_cvt_f16_f32_e32 v1, v3
; GFX9: v_cvt_f16_f32_e32 v0, v0
; GFX9: v_pack_b32_f16 v0, v0, v1
; GFX9: s_setpc_b64
define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
  %src0.ext = fpext <2 x half> %src0 to <2 x float>
  %src1.ext = fpext <2 x half> %src1 to <2 x float>
  %src2.ext = fpext <2 x half> %src2 to <2 x float>
  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
  %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
  %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
  %cvt.result = fptrunc <2 x float> %clamp to <2 x half>
  ret <2 x half> %cvt.result
}

; <3 x half> case with the <3 x float> result clamped before the fptrunc.
; FIXME: Handling undef 4th component
; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
; GCN: s_waitcnt
; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX900-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp

; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp


; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
  %src0.ext = fpext <3 x half> %src0 to <3 x float>
  %src1.ext = fpext <3 x half> %src1 to <3 x float>
  %src2.ext = fpext <3 x half> %src2 to <3 x float>
  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
  %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
  %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
  %cvt.result = fptrunc <3 x float> %clamp to <3 x half>
  ret <3 x half> %cvt.result
}

; <4 x half> case with the <4 x float> result clamped before the fptrunc.
; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt:
; GFX900: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX900: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX900: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX900: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp


; GFX906: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX906: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX906: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX906: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp

; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32
define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
  %src0.ext = fpext <4 x half> %src0 to <4 x float>
  %src1.ext = fpext <4 x half> %src1 to <4 x float>
  %src2.ext = fpext <4 x half> %src2 to <4 x float>
  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
  %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer)
  %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  %cvt.result = fptrunc <4 x float> %clamp to <4 x half>
  ret <4 x half> %cvt.result
}

; Intrinsic declarations used by the functions above.
declare half @llvm.minnum.f16(half, half) #1
declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
declare <3 x half> @llvm.minnum.v3f16(<3 x half>, <3 x half>) #1
declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>) #1

declare half @llvm.maxnum.f16(half, half) #1
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
declare <3 x half> @llvm.maxnum.v3f16(<3 x half>, <3 x half>) #1
declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>) #1

declare float @llvm.minnum.f32(float, float) #1
declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1

declare float @llvm.maxnum.f32(float, float) #1
declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1

declare float @llvm.fmuladd.f32(float, float, float) #1
declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1

; #0 is attached to every test function; #1 to every intrinsic declaration.
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone speculatable }