1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
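2; Denormal mode shouldn't matter for native f16 division, so check with and without flushing. On tahiti the checks go through f32 conversions, which is why the GFX6 output is split into IEEE and FLUSH prefixes.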
3; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-IEEE %s
4; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-FLUSH %s
5
6; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
7; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
8
9; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
10; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
11
; Plain `fdiv half` with no fast-math flags and no !fpmath metadata.
; - tahiti (GFX6-IEEE and GFX6-FLUSH prefixes): both operands are converted to
;   f32, divided with the v_div_scale / v_rcp / v_fma / v_div_fmas /
;   v_div_fixup sequence, and the result is converted back to f16.
;   The FLUSH variant additionally writes 3 and then 0 to
;   hwreg(HW_REG_MODE, 4, 2) around the FMA chain, and writes 0 to
;   hwreg(HW_REG_MODE, 2, 2) before the final conversion.
;   NOTE(review): presumably these are the FP32 denormal field and the
;   FP16/FP64 rounding field of MODE -- confirm against the AMDGPU ISA docs.
; - fiji and gfx900 (shared GFX89 prefix): f32 reciprocal and multiply, a
;   conversion back to f16, then v_div_fixup_f16.
12define half @v_fdiv_f16(half %a, half %b) {
13; GFX6-IEEE-LABEL: v_fdiv_f16:
14; GFX6-IEEE:       ; %bb.0:
15; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
17; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
18; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
19; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
20; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
21; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
22; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
23; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
24; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
25; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
26; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
27; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
28; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
29; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
30; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
31;
32; GFX6-FLUSH-LABEL: v_fdiv_f16:
33; GFX6-FLUSH:       ; %bb.0:
34; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
36; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
37; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
38; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
39; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
40; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
41; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
42; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
43; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
44; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
45; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
46; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
47; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
48; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
49; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
50; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
51; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
52; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
53;
54; GFX89-LABEL: v_fdiv_f16:
55; GFX89:       ; %bb.0:
56; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v1
58; GFX89-NEXT:    v_cvt_f32_f16_e32 v3, v0
59; GFX89-NEXT:    v_rcp_f32_e32 v2, v2
60; GFX89-NEXT:    v_mul_f32_e32 v2, v3, v2
61; GFX89-NEXT:    v_cvt_f16_f32_e32 v2, v2
62; GFX89-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
63; GFX89-NEXT:    s_setpc_b64 s[30:31]
64  %fdiv = fdiv half %a, %b
65  ret half %fdiv
66}
67
; `fdiv afn half`: the approximate-functions flag selects the short
; reciprocal-and-multiply form on every target.
; - tahiti: both denormal modes share the common GFX6 prefix. The operands are
;   converted to f32, combined with v_rcp_f32 and v_mul_f32, and converted back.
; - fiji and gfx900 (shared GFX89 prefix): native v_rcp_f16 followed by
;   v_mul_f16, with no fixup instruction.
68define half @v_fdiv_f16_afn(half %a, half %b) {
69; GFX6-LABEL: v_fdiv_f16_afn:
70; GFX6:       ; %bb.0:
71; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
73; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
74; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
75; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
76; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
77; GFX6-NEXT:    s_setpc_b64 s[30:31]
78;
79; GFX89-LABEL: v_fdiv_f16_afn:
80; GFX89:       ; %bb.0:
81; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
83; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
84; GFX89-NEXT:    s_setpc_b64 s[30:31]
85  %fdiv = fdiv afn half %a, %b
86  ret half %fdiv
87}
88
; `fdiv half` carrying `!fpmath !0` and no fast-math flags.
; NOTE(review): !0 is defined outside the visible part of the file; the
; function name suggests a 2.5 ULP accuracy bound -- confirm at the definition.
; The expected code for every prefix is the same as for v_fdiv_f16, i.e. the
; metadata alone does not change the general (two-operand) f16 division here.
89define half @v_fdiv_f16_ulp25(half %a, half %b) {
90; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
91; GFX6-IEEE:       ; %bb.0:
92; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
94; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
95; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
96; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
97; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
98; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
99; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
100; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
101; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
102; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
103; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
104; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
105; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
106; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
107; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
108;
109; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
110; GFX6-FLUSH:       ; %bb.0:
111; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
113; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
114; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
115; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
116; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
117; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
118; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
119; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
120; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
121; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
122; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
123; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
124; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
125; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
126; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
127; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
128; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
129; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
130;
131; GFX89-LABEL: v_fdiv_f16_ulp25:
132; GFX89:       ; %bb.0:
133; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v1
135; GFX89-NEXT:    v_cvt_f32_f16_e32 v3, v0
136; GFX89-NEXT:    v_rcp_f32_e32 v2, v2
137; GFX89-NEXT:    v_mul_f32_e32 v2, v3, v2
138; GFX89-NEXT:    v_cvt_f16_f32_e32 v2, v2
139; GFX89-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
140; GFX89-NEXT:    s_setpc_b64 s[30:31]
141  %fdiv = fdiv half %a, %b, !fpmath !0
142  ret half %fdiv
143}
144
; Reciprocal written as `fdiv half 1.0, %x`, with no flags or metadata.
; - tahiti (GFX6-IEEE and GFX6-FLUSH prefixes): the constant 1.0 is itself
;   converted with v_cvt_f32_f16 and then used as the numerator of the full
;   f32 division expansion; the FLUSH variant adds the s_setreg_imm32_b32
;   writes to HW_REG_MODE.
; - fiji and gfx900 (shared GFX89 prefix): f32 reciprocal and multiply, then
;   v_div_fixup_f16 with the literal 1.0 as its last operand.
145define half @v_rcp_f16(half %x) {
146; GFX6-IEEE-LABEL: v_rcp_f16:
147; GFX6-IEEE:       ; %bb.0:
148; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
150; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
151; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
152; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
153; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
154; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
155; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
156; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
157; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
158; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
159; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
160; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
161; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
162; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
163; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
164;
165; GFX6-FLUSH-LABEL: v_rcp_f16:
166; GFX6-FLUSH:       ; %bb.0:
167; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
169; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
170; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
171; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
172; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
173; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
174; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
175; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
176; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
177; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
178; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
179; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
180; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
181; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
182; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
183; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
184; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
185; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
186;
187; GFX89-LABEL: v_rcp_f16:
188; GFX89:       ; %bb.0:
189; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
191; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
192; GFX89-NEXT:    v_rcp_f32_e32 v1, v1
193; GFX89-NEXT:    v_mul_f32_e32 v1, v2, v1
194; GFX89-NEXT:    v_cvt_f16_f32_e32 v1, v1
195; GFX89-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
196; GFX89-NEXT:    s_setpc_b64 s[30:31]
197  %fdiv = fdiv half 1.0, %x
198  ret half %fdiv
199}
200
; Reciprocal `fdiv arcp half 1.0, %x`: only the allow-reciprocal flag is set.
; The expected code for every prefix is the same as for the unflagged
; v_rcp_f16, so arcp on its own does not shorten the sequence in these checks
; (compare v_rcp_f16_arcp_afn, where afn is also present).
201define half @v_rcp_f16_arcp(half %x) {
202; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
203; GFX6-IEEE:       ; %bb.0:
204; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
205; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
206; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
207; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
208; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
209; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
210; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
211; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
212; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
213; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
214; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
215; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
216; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
217; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
218; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
219; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
220;
221; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
222; GFX6-FLUSH:       ; %bb.0:
223; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
225; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
226; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
227; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
228; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
229; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
230; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
231; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
232; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
233; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
234; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
235; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
236; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
237; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
238; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
239; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
240; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
241; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
242;
243; GFX89-LABEL: v_rcp_f16_arcp:
244; GFX89:       ; %bb.0:
245; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
247; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
248; GFX89-NEXT:    v_rcp_f32_e32 v1, v1
249; GFX89-NEXT:    v_mul_f32_e32 v1, v2, v1
250; GFX89-NEXT:    v_cvt_f16_f32_e32 v1, v1
251; GFX89-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
252; GFX89-NEXT:    s_setpc_b64 s[30:31]
253  %fdiv = fdiv arcp half 1.0, %x
254  ret half %fdiv
255}
256
; Reciprocal `fdiv arcp afn half 1.0, %x`: both arcp and afn are set.
; - tahiti: both denormal modes share the common GFX6 prefix. %x and the
;   constant 1.0 are converted to f32, then v_rcp_f32 and v_mul_f32 are used
;   (the multiply by the converted 1.0 is still expected) before converting
;   back to f16.
; - fiji and gfx900 (shared GFX89 prefix): a single v_rcp_f16.
257define half @v_rcp_f16_arcp_afn(half %x) {
258; GFX6-LABEL: v_rcp_f16_arcp_afn:
259; GFX6:       ; %bb.0:
260; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
261; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
262; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
263; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
264; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
265; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
266; GFX6-NEXT:    s_setpc_b64 s[30:31]
267;
268; GFX89-LABEL: v_rcp_f16_arcp_afn:
269; GFX89:       ; %bb.0:
270; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
272; GFX89-NEXT:    s_setpc_b64 s[30:31]
273  %fdiv = fdiv arcp afn half 1.0, %x
274  ret half %fdiv
275}
276
; Reciprocal `fdiv half 1.0, %x` carrying `!fpmath !0` and no fast-math flags.
; NOTE(review): !0 is defined outside the visible part of the file; the
; function name suggests a 2.5 ULP accuracy bound -- confirm at the definition.
; - tahiti (GFX6-IEEE and GFX6-FLUSH prefixes): still the full f32 division
;   expansion, the same as the expected code for v_rcp_f16.
; - fiji and gfx900 (shared GFX89 prefix): collapses to a single v_rcp_f16,
;   unlike the unannotated v_rcp_f16 case.
277define half @v_rcp_f16_ulp25(half %x) {
278; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
279; GFX6-IEEE:       ; %bb.0:
280; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
281; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
282; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
283; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
284; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
285; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
286; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
287; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
288; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
289; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
290; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
291; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
292; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
293; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
294; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
295; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
296;
297; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
298; GFX6-FLUSH:       ; %bb.0:
299; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
301; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
302; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
303; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
304; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
305; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
306; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
307; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
308; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
309; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
310; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
311; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
312; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
313; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
314; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
315; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
316; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
317; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
318;
319; GFX89-LABEL: v_rcp_f16_ulp25:
320; GFX89:       ; %bb.0:
321; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
323; GFX89-NEXT:    s_setpc_b64 s[30:31]
324  %fdiv = fdiv half 1.0, %x, !fpmath !0
325  ret half %fdiv
326}
327
; `fdiv afn half` that also carries `!fpmath !0`.
; NOTE(review): !0 is defined outside the visible part of the file; the
; function name suggests a 2.5 ULP accuracy bound -- confirm at the definition.
; The expected code for every prefix is the same as for v_fdiv_f16_afn, so
; adding the metadata on top of afn changes nothing in these checks.
328define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
329; GFX6-LABEL: v_fdiv_f16_afn_ulp25:
330; GFX6:       ; %bb.0:
331; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
333; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
334; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
335; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
336; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
337; GFX6-NEXT:    s_setpc_b64 s[30:31]
338;
339; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
340; GFX89:       ; %bb.0:
341; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
343; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
344; GFX89-NEXT:    s_setpc_b64 s[30:31]
345  %fdiv = fdiv afn half %a, %b, !fpmath !0
346  ret half %fdiv
347}
348
; `fdiv arcp half` that also carries `!fpmath !0`.
; NOTE(review): !0 is defined outside the visible part of the file; the
; function name suggests a 2.5 ULP accuracy bound -- confirm at the definition.
; The expected code for every prefix is the same as for the plain v_fdiv_f16,
; so neither arcp nor the metadata shortens the two-operand division here.
349define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
350; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25:
351; GFX6-IEEE:       ; %bb.0:
352; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
354; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
355; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
356; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
357; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
358; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
359; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
360; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
361; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
362; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
363; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
364; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
365; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
366; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
367; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
368;
369; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25:
370; GFX6-FLUSH:       ; %bb.0:
371; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
373; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
374; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
375; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
376; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
377; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
378; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
379; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
380; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
381; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
382; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
383; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
384; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
385; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
386; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
387; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
388; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
389; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
390;
391; GFX89-LABEL: v_fdiv_f16_arcp_ulp25:
392; GFX89:       ; %bb.0:
393; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v1
395; GFX89-NEXT:    v_cvt_f32_f16_e32 v3, v0
396; GFX89-NEXT:    v_rcp_f32_e32 v2, v2
397; GFX89-NEXT:    v_mul_f32_e32 v2, v3, v2
398; GFX89-NEXT:    v_cvt_f16_f32_e32 v2, v2
399; GFX89-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
400; GFX89-NEXT:    s_setpc_b64 s[30:31]
401  %fdiv = fdiv arcp half %a, %b, !fpmath !0
402  ret half %fdiv
403}
404
; Vector `fdiv <2 x half>` with no fast-math flags and no !fpmath metadata.
; - tahiti (GFX6-IEEE and GFX6-FLUSH prefixes): the checks work on four
;   separate registers v0-v3, convert each to f32, and run the full f32
;   division expansion once per element. In the FLUSH variant each FMA chain is
;   bracketed by writes to hwreg(HW_REG_MODE, 4, 2), while the write to
;   hwreg(HW_REG_MODE, 2, 2) is expected only once, before the first
;   conversion back to f16.
; - fiji and gfx900 use separate GFX8 and GFX9 prefixes here. Both extract the
;   high halves with v_lshrrev_b32 by 16 and divide each element via f32
;   rcp/mul plus v_div_fixup_f16; they differ only in how the result is
;   repacked (SDWA shift and or on GFX8, v_lshlrev_b32 plus v_and_or_b32 with a
;   0xffff mask on GFX9).
405define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
406; GFX6-IEEE-LABEL: v_fdiv_v2f16:
407; GFX6-IEEE:       ; %bb.0:
408; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
410; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
411; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
412; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v3
413; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
414; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
415; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
416; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
417; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
418; GFX6-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
419; GFX6-IEEE-NEXT:    v_fma_f32 v8, -v4, v7, v6
420; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
421; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
422; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
423; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
424; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
425; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
426; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
427; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
428; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
429; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
430; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
431; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
432; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
433; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
434; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
435; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
436; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
437; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
438;
439; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
440; GFX6-FLUSH:       ; %bb.0:
441; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
443; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
444; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
445; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v5, v4
446; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
447; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
448; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
449; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v7, v5, v5
450; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v5
451; GFX6-FLUSH-NEXT:    v_fma_f32 v8, -v4, v7, v6
452; GFX6-FLUSH-NEXT:    v_fma_f32 v7, v8, v5, v7
453; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v4, v7, v6
454; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
455; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
456; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
457; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
458; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
459; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
460; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
461; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
462; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
463; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
464; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
465; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
466; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
467; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
468; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
469; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
470; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
471; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
472; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
473; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
474; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
475; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
476;
477; GFX8-LABEL: v_fdiv_v2f16:
478; GFX8:       ; %bb.0:
479; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
480; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
481; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
482; GFX8-NEXT:    v_cvt_f32_f16_e32 v5, v4
483; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
484; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
485; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
486; GFX8-NEXT:    v_cvt_f32_f16_e32 v7, v6
487; GFX8-NEXT:    v_rcp_f32_e32 v5, v5
488; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
489; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
490; GFX8-NEXT:    v_mul_f32_e32 v3, v7, v5
491; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
492; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
493; GFX8-NEXT:    v_mov_b32_e32 v2, 16
494; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v4, v6
495; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
496; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
497; GFX8-NEXT:    s_setpc_b64 s[30:31]
498;
499; GFX9-LABEL: v_fdiv_v2f16:
500; GFX9:       ; %bb.0:
501; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
503; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
504; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v4
505; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
506; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
507; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
508; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
509; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
510; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
511; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
512; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v5
513; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
514; GFX9-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
515; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
516; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v4, v6
517; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
518; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
519; GFX9-NEXT:    s_setpc_b64 s[30:31]
520  %fdiv = fdiv <2 x half> %a, %b
521  ret <2 x half> %fdiv
522}
523
; Vector `fdiv afn <2 x half>`: reciprocal-and-multiply form per element.
; - tahiti: both denormal modes share the common GFX6 prefix. All four
;   registers are converted to f32, each divisor goes through v_rcp_f32, each
;   quotient through v_mul_f32, and both results are converted back to f16.
; - fiji (GFX8 prefix): v_rcp_f16 / v_mul_f16 for the low half and SDWA forms
;   selecting WORD_1 for the high half, then an SDWA shift-by-16 and or to
;   repack.
; - gfx900 (GFX9 prefix): the same rcp/mul pairs, but the high-half multiply
;   writes dst_sel:WORD_1 directly and the halves are merged with
;   v_and_or_b32 using a 0xffff mask.
524define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
525; GFX6-LABEL: v_fdiv_v2f16_afn:
526; GFX6:       ; %bb.0:
527; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
528; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
529; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
530; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
531; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
532; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
533; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
534; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
535; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
536; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
537; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
538; GFX6-NEXT:    s_setpc_b64 s[30:31]
539;
540; GFX8-LABEL: v_fdiv_v2f16_afn:
541; GFX8:       ; %bb.0:
542; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
544; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
545; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
546; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
547; GFX8-NEXT:    v_mov_b32_e32 v1, 16
548; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
549; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
550; GFX8-NEXT:    s_setpc_b64 s[30:31]
551;
552; GFX9-LABEL: v_fdiv_v2f16_afn:
553; GFX9:       ; %bb.0:
554; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
556; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
557; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
558; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
559; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
560; GFX9-NEXT:    v_and_or_b32 v0, v2, v1, v0
561; GFX9-NEXT:    s_setpc_b64 s[30:31]
562  %fdiv = fdiv afn <2 x half> %a, %b
563  ret <2 x half> %fdiv
564}
565
; Vector `fdiv <2 x half>` carrying `!fpmath !0` and no fast-math flags.
; NOTE(review): !0 is defined outside the visible part of the file; the
; function name suggests a 2.5 ULP accuracy bound -- confirm at the definition.
; The expected code for every prefix is the same as for v_fdiv_v2f16, i.e. the
; metadata alone does not change the two-operand vector division here.
566define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
567; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25:
568; GFX6-IEEE:       ; %bb.0:
569; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
571; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
572; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
573; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v3
574; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
575; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
576; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
577; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
578; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
579; GFX6-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
580; GFX6-IEEE-NEXT:    v_fma_f32 v8, -v4, v7, v6
581; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
582; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
583; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
584; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
585; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
586; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
587; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
588; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
589; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
590; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
591; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
592; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
593; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
594; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
595; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
596; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
597; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
598; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
599;
600; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
601; GFX6-FLUSH:       ; %bb.0:
602; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
603; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
604; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
605; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
606; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v5, v4
607; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
608; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
609; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
610; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v7, v5, v5
611; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v5
612; GFX6-FLUSH-NEXT:    v_fma_f32 v8, -v4, v7, v6
613; GFX6-FLUSH-NEXT:    v_fma_f32 v7, v8, v5, v7
614; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v4, v7, v6
615; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
616; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
617; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
618; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
619; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
620; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
621; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
622; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
623; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
624; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
625; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
626; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
627; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
628; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
629; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
630; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
631; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
632; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
633; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
634; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
635; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
636; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
637;
638; GFX8-LABEL: v_fdiv_v2f16_ulp25:
639; GFX8:       ; %bb.0:
640; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
642; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
643; GFX8-NEXT:    v_cvt_f32_f16_e32 v5, v4
644; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
645; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
646; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
647; GFX8-NEXT:    v_cvt_f32_f16_e32 v7, v6
648; GFX8-NEXT:    v_rcp_f32_e32 v5, v5
649; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
650; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
651; GFX8-NEXT:    v_mul_f32_e32 v3, v7, v5
652; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
653; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
654; GFX8-NEXT:    v_mov_b32_e32 v2, 16
655; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v4, v6
656; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
657; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
658; GFX8-NEXT:    s_setpc_b64 s[30:31]
659;
660; GFX9-LABEL: v_fdiv_v2f16_ulp25:
661; GFX9:       ; %bb.0:
662; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
663; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
664; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
665; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v4
666; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
667; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
668; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
669; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
670; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
671; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
672; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
673; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v5
674; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
675; GFX9-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
676; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
677; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v4, v6
678; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
679; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
680; GFX9-NEXT:    s_setpc_b64 s[30:31]
681  %fdiv = fdiv <2 x half> %a, %b, !fpmath !0
682  ret <2 x half> %fdiv
683}
684
; Reciprocal of a <2 x half>, written as an fdiv of the constant <1.0, 1.0>
; by %x with no fast-math flags and no !fpmath metadata.
; Expected code, as recorded by the autogenerated assertions below:
;  - tahiti, ieee run: each element is converted to f32 and divided with the
;    v_div_scale_f32 / v_rcp_f32 / v_fma_f32 / v_div_fmas_f32 /
;    v_div_fixup_f32 sequence, then converted back with v_cvt_f16_f32.
;  - tahiti, preserve-sign run: same sequence, but 1.0 is materialised through
;    s_movk_i32 0x3c00 and the fma chain is bracketed by s_setreg_imm32_b32
;    writes to HW_REG_MODE (presumably switching f32 denormal handling on and
;    off; confirm against the ISA documentation).
;  - fiji and gfx900: v_rcp_f32 and v_mul_f32 on the f32-converted elements,
;    v_cvt_f16_f32, then v_div_fixup_f16 per element; the two results are
;    repacked into v0 (SDWA shift/or on fiji, v_and_or_b32 on gfx900).
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
685define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
686; GFX6-IEEE-LABEL: v_rcp_v2f16:
687; GFX6-IEEE:       ; %bb.0:
688; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
689; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
690; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
691; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
692; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
693; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
694; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
695; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
696; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
697; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
698; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
699; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
700; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
701; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
702; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
703; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
704; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
705; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
706; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
707; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
708; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
709; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
710; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
711; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
712; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
713; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
714; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
715; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
716; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
717;
718; GFX6-FLUSH-LABEL: v_rcp_v2f16:
719; GFX6-FLUSH:       ; %bb.0:
720; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
721; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
722; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
723; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
724; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
725; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
726; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
727; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
728; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
729; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
730; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
731; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v3, v6, v5
732; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
733; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
734; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
735; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
736; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
737; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
738; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
739; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
740; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
741; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
742; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
743; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
744; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
745; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
746; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
747; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
748; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
749; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
750; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
751; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
752; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
753; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
754; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
755; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
756;
757; GFX8-LABEL: v_rcp_v2f16:
758; GFX8:       ; %bb.0:
759; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
760; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
761; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
762; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
763; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
764; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
765; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
766; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
767; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
768; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
769; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
770; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
771; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
772; GFX8-NEXT:    v_mov_b32_e32 v2, 16
773; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
774; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
775; GFX8-NEXT:    s_setpc_b64 s[30:31]
776;
777; GFX9-LABEL: v_rcp_v2f16:
778; GFX9:       ; %bb.0:
779; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
780; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
781; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
782; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
783; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
784; GFX9-NEXT:    v_rcp_f32_e32 v1, v1
785; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
786; GFX9-NEXT:    v_mul_f32_e32 v1, v4, v1
787; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
788; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
789; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
790; GFX9-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
791; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
792; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
793; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
794; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
795; GFX9-NEXT:    s_setpc_b64 s[30:31]
796  %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
797  ret <2 x half> %fdiv
798}
799
; Same reciprocal as v_rcp_v2f16, but the fdiv carries the `arcp` fast-math
; flag (and nothing else).
; The assertions below record the same instruction sequences as for
; v_rcp_v2f16 on every target: the full f32 division expansion on tahiti in
; both denormal runs, and v_rcp_f32 / v_mul_f32 / v_div_fixup_f16 on fiji and
; gfx900. In other words, `arcp` on its own does not change the expected code.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
800define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
801; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
802; GFX6-IEEE:       ; %bb.0:
803; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
804; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
805; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
806; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
807; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
808; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
809; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
810; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
811; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
812; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
813; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
814; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
815; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
816; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
817; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
818; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
819; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
820; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
821; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
822; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
823; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
824; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
825; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
826; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
827; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
828; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
829; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
830; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
831; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
832;
833; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
834; GFX6-FLUSH:       ; %bb.0:
835; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
836; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
837; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
838; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
839; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
840; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
841; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
842; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
843; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
844; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
845; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
846; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v3, v6, v5
847; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
848; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
849; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
850; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
851; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
852; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
853; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
854; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
855; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
856; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
857; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
858; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
859; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
860; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
861; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
862; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
863; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
864; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
865; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
866; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
867; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
868; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
869; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
870; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
871;
872; GFX8-LABEL: v_rcp_v2f16_arcp:
873; GFX8:       ; %bb.0:
874; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
875; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
876; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
877; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
878; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
879; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
880; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
881; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
882; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
883; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
884; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
885; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
886; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
887; GFX8-NEXT:    v_mov_b32_e32 v2, 16
888; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
889; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
890; GFX8-NEXT:    s_setpc_b64 s[30:31]
891;
892; GFX9-LABEL: v_rcp_v2f16_arcp:
893; GFX9:       ; %bb.0:
894; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
895; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
896; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
897; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
898; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
899; GFX9-NEXT:    v_rcp_f32_e32 v1, v1
900; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
901; GFX9-NEXT:    v_mul_f32_e32 v1, v4, v1
902; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
903; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
904; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
905; GFX9-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
906; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
907; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
908; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
909; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
910; GFX9-NEXT:    s_setpc_b64 s[30:31]
911  %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
912  ret <2 x half> %fdiv
913}
914
; Reciprocal of a <2 x half> where the fdiv of <1.0, 1.0> by %x carries both
; the `arcp` and `afn` fast-math flags.
; Expected code, as recorded by the autogenerated assertions below:
;  - tahiti: one shared set of checks for both denormal runs. Each element is
;    converted to f32, passed through v_rcp_f32, multiplied by the converted
;    1.0 and converted back; there is no div_scale/div_fmas/div_fixup
;    expansion.
;  - fiji and gfx900: one v_rcp_f16 per element (the high half via an SDWA
;    form selecting WORD_1), followed by repacking of the two results into v0.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
915define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
916; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
917; GFX6:       ; %bb.0:
918; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
920; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
921; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
922; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
923; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
924; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v0
925; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
926; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
927; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
928; GFX6-NEXT:    s_setpc_b64 s[30:31]
929;
930; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
931; GFX8:       ; %bb.0:
932; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
933; GFX8-NEXT:    v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
934; GFX8-NEXT:    v_rcp_f16_e32 v0, v0
935; GFX8-NEXT:    v_mov_b32_e32 v2, 16
936; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
937; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
938; GFX8-NEXT:    s_setpc_b64 s[30:31]
939;
940; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
941; GFX9:       ; %bb.0:
942; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
944; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
945; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
946; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
947; GFX9-NEXT:    s_setpc_b64 s[30:31]
948  %fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
949  ret <2 x half> %fdiv
950}
951
; Reciprocal of a <2 x half> where the fdiv of <1.0, 1.0> by %x has no
; fast-math flags but is tagged with !fpmath !0 (the 2.5 node defined at the
; end of this file).
; Expected code, as recorded by the autogenerated assertions below:
;  - tahiti, both denormal runs: still the full f32 expansion
;    (v_div_scale_f32 / v_rcp_f32 / v_fma_f32 / v_div_fmas_f32 /
;    v_div_fixup_f32), identical to what is recorded for v_rcp_v2f16.
;  - fiji and gfx900: a bare v_rcp_f16 per element plus repacking, i.e. the
;    same code as recorded for v_rcp_v2f16_arcp_afn.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
952define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
953; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
954; GFX6-IEEE:       ; %bb.0:
955; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
956; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
957; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
958; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
959; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
960; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
961; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
962; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
963; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
964; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
965; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
966; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
967; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
968; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
969; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
970; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
971; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
972; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
973; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
974; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
975; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
976; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
977; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
978; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
979; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
980; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
981; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
982; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
983; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
984;
985; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
986; GFX6-FLUSH:       ; %bb.0:
987; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
988; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
989; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
990; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
991; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
992; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
993; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
994; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
995; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
996; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
997; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
998; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v3, v6, v5
999; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
1000; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
1001; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1002; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
1003; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
1004; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
1005; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
1006; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1007; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
1008; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
1009; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
1010; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
1011; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1012; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
1013; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
1014; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
1015; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
1016; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
1017; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
1018; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1019; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
1020; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
1021; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
1022; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
1023;
1024; GFX8-LABEL: v_rcp_v2f16_ulp25:
1025; GFX8:       ; %bb.0:
1026; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1027; GFX8-NEXT:    v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1028; GFX8-NEXT:    v_rcp_f16_e32 v0, v0
1029; GFX8-NEXT:    v_mov_b32_e32 v2, 16
1030; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1031; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1032; GFX8-NEXT:    s_setpc_b64 s[30:31]
1033;
1034; GFX9-LABEL: v_rcp_v2f16_ulp25:
1035; GFX9:       ; %bb.0:
1036; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
1038; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1039; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
1040; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
1041; GFX9-NEXT:    s_setpc_b64 s[30:31]
1042  %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0
1043  ret <2 x half> %fdiv
1044}
1045
; General <2 x half> division %a / %b where the fdiv carries the `afn`
; fast-math flag and is tagged with !fpmath !0 (the 2.5 node defined at the
; end of this file).
; Expected code, as recorded by the autogenerated assertions below:
;  - tahiti: one shared set of checks for both denormal runs. All four
;    elements are converted to f32, the divisor elements go through v_rcp_f32,
;    and each quotient is a single v_mul_f32 converted back to f16.
;  - fiji and gfx900: v_rcp_f16 on each divisor half followed by v_mul_f16
;    (SDWA forms select the high halves), then the results are repacked
;    into v0.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
1046define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
1047; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
1048; GFX6:       ; %bb.0:
1049; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1050; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
1051; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
1052; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
1053; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
1054; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
1055; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
1056; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
1057; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
1058; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
1059; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
1060; GFX6-NEXT:    s_setpc_b64 s[30:31]
1061;
1062; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
1063; GFX8:       ; %bb.0:
1064; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1065; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
1066; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1067; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
1068; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1069; GFX8-NEXT:    v_mov_b32_e32 v1, 16
1070; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1071; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1072; GFX8-NEXT:    s_setpc_b64 s[30:31]
1073;
1074; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
1075; GFX9:       ; %bb.0:
1076; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1077; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
1078; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1079; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
1080; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1081; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
1082; GFX9-NEXT:    v_and_or_b32 v0, v2, v1, v0
1083; GFX9-NEXT:    s_setpc_b64 s[30:31]
1084  %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
1085  ret <2 x half> %fdiv
1086}
1087
; General <2 x half> division %a / %b where the fdiv carries the `arcp`
; fast-math flag and is tagged with !fpmath !0 (the 2.5 node defined at the
; end of this file).
; Expected code, as recorded by the autogenerated assertions below:
;  - tahiti, ieee run: each element pair is converted to f32 and divided with
;    the v_div_scale_f32 / v_rcp_f32 / v_fma_f32 / v_div_fmas_f32 /
;    v_div_fixup_f32 sequence, then converted back with v_cvt_f16_f32.
;  - tahiti, preserve-sign run: same sequence with the fma chain bracketed by
;    s_setreg_imm32_b32 writes to HW_REG_MODE (presumably switching f32
;    denormal handling on and off; confirm against the ISA documentation).
;  - fiji and gfx900: v_rcp_f32 and v_mul_f32 on the f32-converted elements,
;    v_cvt_f16_f32, then v_div_fixup_f16 per element, and repacking into v0.
; Unlike the `afn` variants in this file, no target is expected to use
; v_rcp_f16 here.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
1088define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
1089; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
1090; GFX6-IEEE:       ; %bb.0:
1091; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1092; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
1093; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
1094; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
1095; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v3
1096; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
1097; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
1098; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
1099; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1100; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
1101; GFX6-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
1102; GFX6-IEEE-NEXT:    v_fma_f32 v8, -v4, v7, v6
1103; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
1104; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
1105; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
1106; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
1107; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
1108; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
1109; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
1110; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
1111; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
1112; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
1113; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
1114; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
1115; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
1116; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
1117; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
1118; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
1119; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
1120; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
1121;
1122; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
1123; GFX6-FLUSH:       ; %bb.0:
1124; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
1126; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
1127; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
1128; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v5, v4
1129; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
1130; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1131; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1132; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v7, v5, v5
1133; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v5
1134; GFX6-FLUSH-NEXT:    v_fma_f32 v8, -v4, v7, v6
1135; GFX6-FLUSH-NEXT:    v_fma_f32 v7, v8, v5, v7
1136; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v4, v7, v6
1137; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1138; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
1139; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
1140; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
1141; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
1142; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1143; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
1144; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
1145; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
1146; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
1147; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1148; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
1149; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
1150; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
1151; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
1152; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
1153; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
1154; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1155; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
1156; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
1157; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
1158; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
1159;
1160; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
1161; GFX8:       ; %bb.0:
1162; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1163; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1164; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
1165; GFX8-NEXT:    v_cvt_f32_f16_e32 v5, v4
1166; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
1167; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
1168; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
1169; GFX8-NEXT:    v_cvt_f32_f16_e32 v7, v6
1170; GFX8-NEXT:    v_rcp_f32_e32 v5, v5
1171; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
1172; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
1173; GFX8-NEXT:    v_mul_f32_e32 v3, v7, v5
1174; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
1175; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
1176; GFX8-NEXT:    v_mov_b32_e32 v2, 16
1177; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v4, v6
1178; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1179; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1180; GFX8-NEXT:    s_setpc_b64 s[30:31]
1181;
1182; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
1183; GFX9:       ; %bb.0:
1184; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1185; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1186; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
1187; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v4
1188; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
1189; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
1190; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
1191; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
1192; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
1193; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
1194; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
1195; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v5
1196; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
1197; GFX9-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
1198; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
1199; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v4, v6
1200; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1201; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
1202; GFX9-NEXT:    s_setpc_b64 s[30:31]
1203  %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
1204  ret <2 x half> %fdiv
1205}
1206
; General <2 x half> division %a / %b where the fdiv carries both the `afn`
; and `arcp` fast-math flags and is tagged with !fpmath !0 (the 2.5 node
; defined at the end of this file).
; The assertions below record the same instruction sequences as for
; v_fdiv_v2f16_afn_ulp25: v_rcp_f32 + v_mul_f32 on f32-converted elements on
; tahiti (one shared set of checks for both denormal runs), and
; v_rcp_f16 + v_mul_f16 with repacking on fiji and gfx900. Adding `arcp` on
; top of `afn` therefore does not change the expected code.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
1207define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
1208; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
1209; GFX6:       ; %bb.0:
1210; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1211; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
1212; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
1213; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
1214; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
1215; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
1216; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
1217; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
1218; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
1219; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
1220; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
1221; GFX6-NEXT:    s_setpc_b64 s[30:31]
1222;
1223; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
1224; GFX8:       ; %bb.0:
1225; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1226; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
1227; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1228; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
1229; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1230; GFX8-NEXT:    v_mov_b32_e32 v1, 16
1231; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1232; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1233; GFX8-NEXT:    s_setpc_b64 s[30:31]
1234;
1235; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
1236; GFX9:       ; %bb.0:
1237; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1238; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
1239; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1240; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
1241; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1242; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
1243; GFX9-NEXT:    v_and_or_b32 v0, v2, v1, v0
1244; GFX9-NEXT:    s_setpc_b64 s[30:31]
1245  %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
1246  ret <2 x half> %fdiv
1247}
1248
; Operand node for the !fpmath attachments on the *_ulp25 tests in this file.
; Per the LLVM LangRef, an !fpmath node holds the maximum acceptable error of
; the result in ULPs, so this value permits up to 2.5 ULP.
1249!0 = !{float 2.500000e+00}
1250