1; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX906 %s
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX900 %s
3; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
4; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
5
; Scalar f32 fmuladd whose result is truncated to half. Per the checks below,
; gfx900 is expected to select a single v_mad_mixlo_f16 and gfx906 a single
; v_fma_mixlo_f16; no op_sel_hi modifier is expected (all three sources are
; f32). fiji/hawaii are expected to use v_mac_f32 followed by v_cvt_f16_f32.
; The label is anchored to the start of the line like every other label check
; in this file.
6; GCN-LABEL: {{^}}mixlo_simple:
7; GCN: s_waitcnt
8; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2{{$}}
9; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2{{$}}
10; GFX9-NEXT: s_setpc_b64
11
12; CIVI: v_mac_f32_e32
13; CIVI: v_cvt_f16_f32_e32
14define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
15  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
16  %cvt.result = fptrunc float %result to half
17  ret half %cvt.result
18}
19
; All three half operands are extended to f32, combined with fmuladd, and the
; result is truncated back to half. gfx900/gfx906 are expected to fold the
; extensions and the truncation into one mixlo instruction carrying
; op_sel_hi:[1,1,1]. v_mac_f32 is only checked for hawaii; v_cvt_f16_f32 is
; checked for both fiji and hawaii.
20; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f16lo:
21; GFX900: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
22; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
23; CI: v_mac_f32
24; CIVI: v_cvt_f16_f32
25define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
26  %src0.ext = fpext half %src0 to float
27  %src1.ext = fpext half %src1 to float
28  %src2.ext = fpext half %src2 to float
29  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
30  %cvt.result = fptrunc float %result to half
31  ret half %cvt.result
32}
33
; Mixed-precision inputs: src0 and src1 are half values extended to f32, while
; src2 is passed as f32 directly. The gfx900/gfx906 checks expect one mixlo
; instruction with op_sel_hi:[1,1,0] immediately followed by the return.
; fiji/hawaii are only checked for v_mac_f32.
34; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32:
35; GCN: s_waitcnt
36; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
37; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
38; GFX9-NEXT: s_setpc_b64
39
40; CIVI: v_mac_f32
41define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
42  %src0.ext = fpext half %src0 to float
43  %src1.ext = fpext half %src1 to float
44  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
45  %cvt.result = fptrunc float %result to half
46  ret half %cvt.result
47}
48
; Same operand mix as the f16lo_f16lo_f32 case, but the half result is clamped
; to [0.0, 1.0] with maxnum/minnum *after* the fptrunc. The gfx900/gfx906
; checks expect the clamp to be folded into the mixlo instruction as the
; "clamp" modifier. fiji/hawaii are checked for a v_mac_f32_e32 with no
; trailing modifier.
49; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
50; GCN: s_waitcnt
51; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
52; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
53; GFX9-NEXT: s_setpc_b64
54
55; CIVI: v_mac_f32_e32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]$}}
56define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
57  %src0.ext = fpext half %src0 to float
58  %src1.ext = fpext half %src1 to float
59  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
60  %cvt.result = fptrunc float %result to half
  ; maxnum(x, 0.0) followed by minnum(., 1.0) forms the [0.0, 1.0] clamp.
61  %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
62  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
63  ret half %clamp
64}
65
; Variant where the [0.0, 1.0] clamp is applied to the f32 fmuladd result
; *before* the fptrunc. The gfx900/gfx906 checks expect the clamp on an f32 mix
; instruction (v_mad_mix_f32 / v_fma_mix_f32) followed by a separate
; v_cvt_f16_f32, i.e. no mixlo form is expected here. fiji/hawaii are checked
; for a v_mad_f32 carrying the clamp modifier.
66; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
67; GCN: s_waitcnt
68; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
69; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
70; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
71; GFX9-NEXT: s_setpc_b64
72
73; CIVI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
74define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 {
75  %src0.ext = fpext half %src0 to float
76  %src1.ext = fpext half %src1 to float
77  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
  ; Clamp to [0.0, 1.0] in f32, then truncate.
78  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
79  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
80  %cvt.result = fptrunc float %clamp to half
81  ret half %cvt.result
82}
83
; <2 x half> version: both lanes are extended, combined with the v2f32
; fmuladd, and truncated back. The gfx900/gfx906 checks expect a mixlo/mixhi
; pair writing v3 (the mixhi selecting the high halves via op_sel:[1,1,1]),
; followed by a copy of v3 into v0.
84; FIXME: Should be able to avoid extra register because first
85; operation only clobbers relevant lane.
86; GCN-LABEL: {{^}}v_mad_mix_v2f32:
87; GCN: s_waitcnt
88
89; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
90; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]{{$}}
91
92; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
93; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]{{$}}
94
95; GFX9-NEXT: v_mov_b32_e32 v0, v3
96; GFX9-NEXT: s_setpc_b64
97define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
98  %src0.ext = fpext <2 x half> %src0 to <2 x float>
99  %src1.ext = fpext <2 x half> %src1 to <2 x float>
100  %src2.ext = fpext <2 x half> %src2 to <2 x float>
101  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
102  %cvt.result = fptrunc <2 x float> %result to <2 x half>
103  ret <2 x half> %cvt.result
104}
105
; <3 x half> version. The gfx900/gfx906 checks expect three mix instructions:
; a mixlo on v1/v3/v5 written in place to v1, and a mixlo/mixhi pair on
; v0/v2/v4 written to v3, which is then copied into v0.
106; GCN-LABEL: {{^}}v_mad_mix_v3f32:
107; GCN: s_waitcnt
108; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
109; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
110; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
111
112; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
113; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
114; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
115
116; GFX9-NEXT: v_mov_b32_e32 v0, v3
117; GFX9-NEXT: s_setpc_b64
118define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
119  %src0.ext = fpext <3 x half> %src0 to <3 x float>
120  %src1.ext = fpext <3 x half> %src1 to <3 x float>
121  %src2.ext = fpext <3 x half> %src2 to <3 x float>
122  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
123  %cvt.result = fptrunc <3 x float> %result to <3 x half>
124  ret <3 x half> %cvt.result
125}
126
; <4 x half> version. The gfx900/gfx906 checks expect two mixlo/mixhi pairs:
; one on v1/v3/v5 writing v6 and one on v0/v2/v4 writing v7, in the exact
; order listed below, followed by copies of v7 into v0 and v6 into v1.
127; GCN-LABEL: {{^}}v_mad_mix_v4f32:
128; GCN: s_waitcnt
129; GFX900-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
130; GFX900-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
131; GFX900-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
132; GFX900-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
133
134; GFX906-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
135; GFX906-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
136; GFX906-NEXT: v_fma_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
137; GFX906-NEXT: v_fma_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
138
139; GFX9-NEXT: v_mov_b32_e32 v0, v7
140; GFX9-NEXT: v_mov_b32_e32 v1, v6
141; GFX9-NEXT: s_setpc_b64
142define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
143  %src0.ext = fpext <4 x half> %src0 to <4 x float>
144  %src1.ext = fpext <4 x half> %src1 to <4 x float>
145  %src2.ext = fpext <4 x half> %src2 to <4 x float>
146  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
147  %cvt.result = fptrunc <4 x float> %result to <4 x half>
148  ret <4 x half> %cvt.result
149}
150
; <2 x half> version with a [0.0, 1.0] clamp applied to the packed half result
; after the fptrunc. The gfx900/gfx906 checks expect the clamp modifier on both
; the mixlo and the mixhi instruction, then a copy of v3 into v0.
151; FIXME: Fold clamp
; NOTE(review): the expected output below already carries "clamp" on both mix
; instructions, so the FIXME above may be stale - confirm before removing.
152; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt:
153; GFX900: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
154; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp{{$}}
155
156; GFX906: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
157; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp{{$}}
158
159; GFX9-NEXT: v_mov_b32_e32 v0, v3
160; GFX9-NEXT: s_setpc_b64
161define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
162  %src0.ext = fpext <2 x half> %src0 to <2 x float>
163  %src1.ext = fpext <2 x half> %src1 to <2 x float>
164  %src2.ext = fpext <2 x half> %src2 to <2 x float>
165  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
166  %cvt.result = fptrunc <2 x float> %result to <2 x half>
  ; Per-lane clamp to [0.0, 1.0] on the already-truncated halves.
167  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %cvt.result, <2 x half> zeroinitializer)
168  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
169  ret <2 x half> %clamp
170}
171
; <3 x half> version of the post-conversion clamp. The gfx900/gfx906 checks
; (order-independent, destination registers wildcarded) expect the clamp
; modifier on the mixlo/mixhi pair that reads v0/v2/v4, an unclamped mixlo
; reading v1/v3/v5, and a separate "v_pk_max_f16 v1, v1, v1 clamp" for that
; remaining component.
172; FIXME: Should be packed into 2 registers per argument?
173; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
174; GCN: s_waitcnt
175; GFX900-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp
176; GFX900-DAG: v_mad_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
177; GFX900-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1]
178
179; GFX906-DAG: v_fma_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp
180; GFX906-DAG: v_fma_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
181; GFX906-DAG: v_fma_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1]
182
183
184; GFX9-DAG: v_pk_max_f16 v1, v1, v1 clamp
185; GFX9: v_mov_b32_e32 v0, v{{[0-9]+}}
186; GFX9-NEXT: s_setpc_b64
187define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
188  %src0.ext = fpext <3 x half> %src0 to <3 x float>
189  %src1.ext = fpext <3 x half> %src1 to <3 x float>
190  %src2.ext = fpext <3 x half> %src2 to <3 x float>
191  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
192  %cvt.result = fptrunc <3 x float> %result to <3 x half>
193  %max = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %cvt.result, <3 x half> zeroinitializer)
194  %clamp = call <3 x half> @llvm.minnum.v3f16(<3 x half> %max, <3 x half> <half 1.0, half 1.0, half 1.0>)
195  ret <3 x half> %clamp
196}
197
; <4 x half> version of the post-conversion clamp. The gfx900/gfx906 checks
; expect all four mix instructions to carry the clamp modifier: a mixlo/mixhi
; pair on v0/v2/v4 writing v6 and a pair on v1/v3/v5 writing v2, followed by
; copies of v6 into v0 and v2 into v1.
198; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:
199; GCN: s_waitcnt
200; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
201; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
202; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
203; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
204
205
206; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
207; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
208; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
209; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
210
211
212; GFX9-NEXT: v_mov_b32_e32 v0, v6
213; GFX9-NEXT: v_mov_b32_e32 v1, v2
214; GFX9-NEXT: s_setpc_b64
215define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
216  %src0.ext = fpext <4 x half> %src0 to <4 x float>
217  %src1.ext = fpext <4 x half> %src1 to <4 x float>
218  %src2.ext = fpext <4 x half> %src2 to <4 x float>
219  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
220  %cvt.result = fptrunc <4 x float> %result to <4 x half>
221  %max = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %cvt.result, <4 x half> zeroinitializer)
222  %clamp = call <4 x half> @llvm.minnum.v4f16(<4 x half> %max, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>)
223  ret <4 x half> %clamp
224}
225
; Only element 0 of the truncated <2 x half> result is clamped to [0.0, 1.0];
; element 1 is returned unclamped. The gfx900/gfx906 checks expect the clamp
; modifier on the mixlo instruction only, with the mixhi left unclamped.
226; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_lo:
227; GCN: s_waitcnt
228; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
229; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
230
231; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
232; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
233
234; GFX9-NEXT: v_mov_b32_e32 v0, v3
235; GFX9-NEXT: s_setpc_b64
236define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
237  %src0.ext = fpext <2 x half> %src0 to <2 x float>
238  %src1.ext = fpext <2 x half> %src1 to <2 x float>
239  %src2.ext = fpext <2 x half> %src2 to <2 x float>
240  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
241  %cvt.result = fptrunc <2 x float> %result to <2 x half>
  ; Extract element 0, clamp it, and insert it back over the original lane.
242  %cvt.lo = extractelement <2 x half> %cvt.result, i32 0
243  %max.lo = call half @llvm.maxnum.f16(half %cvt.lo, half 0.0)
244  %clamp.lo = call half @llvm.minnum.f16(half %max.lo, half 1.0)
245  %insert = insertelement <2 x half> %cvt.result, half %clamp.lo, i32 0
246  ret <2 x half> %insert
247}
248
; Only element 1 of the truncated <2 x half> result is clamped to [0.0, 1.0];
; element 0 is returned unclamped. The gfx900/gfx906 checks expect the clamp
; modifier on the mixhi instruction only, with the mixlo left unclamped.
249; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_hi:
250; GCN: s_waitcnt
251; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
252; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
253
254; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
255; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
256
257; GFX9-NEXT: v_mov_b32_e32 v0, v3
258; GFX9-NEXT: s_setpc_b64
259define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
260  %src0.ext = fpext <2 x half> %src0 to <2 x float>
261  %src1.ext = fpext <2 x half> %src1 to <2 x float>
262  %src2.ext = fpext <2 x half> %src2 to <2 x float>
263  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
264  %cvt.result = fptrunc <2 x float> %result to <2 x half>
  ; Extract element 1, clamp it, and insert it back over the original lane.
265  %cvt.hi = extractelement <2 x half> %cvt.result, i32 1
266  %max.hi = call half @llvm.maxnum.f16(half %cvt.hi, half 0.0)
267  %clamp.hi = call half @llvm.minnum.f16(half %max.hi, half 1.0)
268  %insert = insertelement <2 x half> %cvt.result, half %clamp.hi, i32 1
269  ret <2 x half> %insert
270}
271
; <2 x half> version with the [0.0, 1.0] clamp applied to the f32 vector
; *before* the fptrunc. The gfx900/gfx906 checks expect two clamped f32 mix
; instructions (high lane into v3, low lane into v0), then two v_cvt_f16_f32
; conversions and a v_pack_b32_f16 to rebuild the packed result.
272; FIXME: Should be able to use mixlo/mixhi
273; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:
274; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
275; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
276
277; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
278; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
279
280; GFX9: v_cvt_f16_f32_e32 v1, v3
281; GFX9: v_cvt_f16_f32_e32 v0, v0
282; GFX9: v_pack_b32_f16 v0, v0, v1
283; GFX9: s_setpc_b64
284define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
285  %src0.ext = fpext <2 x half> %src0 to <2 x float>
286  %src1.ext = fpext <2 x half> %src1 to <2 x float>
287  %src2.ext = fpext <2 x half> %src2 to <2 x float>
288  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
  ; Clamp each f32 lane to [0.0, 1.0], then truncate.
289  %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
290  %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
291  %cvt.result = fptrunc <2 x float> %clamp to <2 x half>
292  ret <2 x half> %cvt.result
293}
294
; <3 x half> version of the pre-conversion clamp. The gfx900/gfx906 checks
; expect three clamped f32 mix instructions, three v_cvt_f16_f32 conversions,
; and one v_pack_b32_f16 combining v0 and v2 into v0; the value in v1 is
; converted in place and not packed.
295; FIXME: Handling undef 4th component
296; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
297; GCN: s_waitcnt
298; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
299; GFX900-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
300; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
301
302; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
303; GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
304; GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
305
306
307; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v3
308; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
309; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
310; GFX9-NEXT: v_pack_b32_f16 v0, v0, v2
311; GFX9-NEXT: s_setpc_b64
312define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
313  %src0.ext = fpext <3 x half> %src0 to <3 x float>
314  %src1.ext = fpext <3 x half> %src1 to <3 x float>
315  %src2.ext = fpext <3 x half> %src2 to <3 x float>
316  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
317  %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
318  %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
319  %cvt.result = fptrunc <3 x float> %clamp to <3 x half>
320  ret <3 x half> %cvt.result
321}
322
; <4 x half> version of the pre-conversion clamp. The gfx900/gfx906 checks
; expect four clamped f32 mix instructions (one per lane) followed by four
; v_cvt_f16_f32 conversions. These checks are loose (no -NEXT), so other
; instructions may appear in between, and the final packing is not checked.
323; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt:
324; GFX900: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
325; GFX900: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
326; GFX900: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
327; GFX900: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
328
329
330; GFX906: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
331; GFX906: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
332; GFX906: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
333; GFX906: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
334
335; GFX9: v_cvt_f16_f32
336; GFX9: v_cvt_f16_f32
337; GFX9: v_cvt_f16_f32
338; GFX9: v_cvt_f16_f32
339define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
340  %src0.ext = fpext <4 x half> %src0 to <4 x float>
341  %src1.ext = fpext <4 x half> %src1 to <4 x float>
342  %src2.ext = fpext <4 x half> %src2 to <4 x float>
343  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
344  %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer)
345  %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
346  %cvt.result = fptrunc <4 x float> %clamp to <4 x half>
347  ret <4 x half> %cvt.result
348}
349
; Intrinsic declarations used by the test functions in this file, grouped by
; operation and element type. minnum/maxnum are used to build the [0.0, 1.0]
; clamp patterns; fmuladd is the multiply-add being selected.
350declare half @llvm.minnum.f16(half, half) #1
351declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
352declare <3 x half> @llvm.minnum.v3f16(<3 x half>, <3 x half>) #1
353declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>) #1
354
355declare half @llvm.maxnum.f16(half, half) #1
356declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
357declare <3 x half> @llvm.maxnum.v3f16(<3 x half>, <3 x half>) #1
358declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>) #1
359
360declare float @llvm.minnum.f32(float, float) #1
361declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
362declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1
363declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1
364
365declare float @llvm.maxnum.f32(float, float) #1
366declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
367declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1
368declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1
369
370declare float @llvm.fmuladd.f32(float, float, float) #1
371declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
372declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
373declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
374
; #0 is attached to every test function: nounwind, with the f32 denormal mode
; set to preserve-sign for both output and input.
; #1 is attached to every intrinsic declaration above.
375attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
376attributes #1 = { nounwind readnone speculatable }
377