/*
 * Single-precision SVE powf function.
 *
 * Copyright (c) 2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"

/* The following data is used in the SVE pow core computation
   and special case detection.  */
#define Tinvc __v_powf_data.invc
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
#define Shift 0x1.8p52
#define Norm 0x1p23f /* 0x4b000000.  */
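/* Shift implements the usual round-to-int bit trick: adding 0x1.8p52 to a
   double of small enough magnitude forces its integer part into the low bits
   of the representation, so the rounded value can be read back by
   reinterpreting the bits and the remainder recovered by subtracting Shift
   again.  Norm (2^23) rescales subnormal inputs into the normal range, and
   subnormal_bias below (23 << 23) removes the extra 23 from the exponent
   field afterwards.  SignBias is positioned so that, once shifted into the
   exponent/sign field in the exp2 path, it sets the sign bit of the
   result.  */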

/* Overall ULP error bound for pow is 2.6 ulp
   ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
static const struct data
{
  double log_poly[4];
  double exp_poly[3];
  float uflow_bound, oflow_bound, small_bound;
  uint32_t sign_bias, sign_mask, subnormal_bias, off;
} data = {
  /* rel err: 1.5 * 2^-30. Each coefficient is multiplied by the value of
     V_POWF_EXP2_N.  */
  .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
                -0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
  /* rel err: 1.69 * 2^-34.  */
  .exp_poly = {
    0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3.  */
    0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2.  */
    0x1.62e42ff0c52d6p-6, /* A2 / V_POWF_EXP2_N.  */
  },
  .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N.  */
  .oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N.  */
  .small_bound = 0x1p-126f,
  .off = 0x3f35d000,
  .sign_bias = SignBias,
  .sign_mask = 0x80000000,
  .subnormal_bias = 0x0b800000, /* 23 << 23.  */
};

#define A(i) sv_f64 (d->log_poly[i])
#define C(i) sv_f64 (d->exp_poly[i])

/* Check if x is an integer.  */
static inline svbool_t
svisint (svbool_t pg, svfloat32_t x)
{
  return svcmpeq (pg, svrintz_z (pg, x), x);
}

/* Check if x is a real value that is not an integer.  */
static inline svbool_t
svisnotint (svbool_t pg, svfloat32_t x)
{
  return svcmpne (pg, svrintz_z (pg, x), x);
}

/* Check if x is an odd integer.  */
static inline svbool_t
svisodd (svbool_t pg, svfloat32_t x)
{
  svfloat32_t y = svmul_x (pg, x, 0.5f);
  return svisnotint (pg, y);
}

/* Check if zero, inf or nan.  */
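/* Multiplying the bit pattern by 2 drops the sign bit: +/-0 then wraps to
   UINT32_MAX after subtracting 1, while inf and nan map to values of at
   least 2 * 0x7f800000 - 1, so a single unsigned comparison covers all three
   classes.  */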
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint32_t i)
{
  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
                  2u * 0x7f800000 - 1);
}

/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
   the bit representation of a non-zero finite floating-point value.  */
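/* For example, checkint (asuint (3.5f)) is 0, checkint (asuint (3.0f)) is 1
   and checkint (asuint (4.0f)) is 2.  */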
static inline int
checkint (uint32_t iy)
{
  int e = iy >> 23 & 0xff;
  if (e < 0x7f)
    /* |y| < 1: cannot be a non-zero integer.  */
    return 0;
  if (e > 0x7f + 23)
    /* All significand bits are above the binary point: even integer.  */
    return 2;
  if (iy & ((1 << (0x7f + 23 - e)) - 1))
    /* Non-zero fraction bits: not an integer.  */
    return 0;
  if (iy & (1 << (0x7f + 23 - e)))
    /* Units bit set: odd integer.  */
    return 1;
  return 2;
}

/* Check if zero, inf or nan.  */
static inline int
zeroinfnan (uint32_t ix)
{
  return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
}

/* A scalar subroutine used to fix main power special cases. Similar to the
   preamble of finite_powf except that we do not update ix and sign_bias. This
   is done in the preamble of the SVE powf.  */
static inline float
powf_specialcase (float x, float y, float z)
{
  uint32_t ix = asuint (x);
  uint32_t iy = asuint (y);
  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
  if (unlikely (zeroinfnan (iy)))
    {
      if (2 * iy == 0)
        return issignalingf_inline (x) ? x + y : 1.0f;
      if (ix == 0x3f800000)
        return issignalingf_inline (y) ? x + y : 1.0f;
      if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
        return x + y;
      if (2 * ix == 2 * 0x3f800000)
        return 1.0f;
      if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
        return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
      return y * y;
    }
  if (unlikely (zeroinfnan (ix)))
    {
      float_t x2 = x * x;
      if (ix & 0x80000000 && checkint (iy) == 1)
        x2 = -x2;
      return iy & 0x80000000 ? 1 / x2 : x2;
    }
  /* We need a return here in case x<0 and y is integer, but all other tests
     need to be run.  */
  return z;
}

/* Scalar fallback for special case routines with custom signature.  */
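/* Each iteration of the loop below isolates one active lane of CMP: svpfirst
   selects the first active lane, svclastb extracts that lane's x, y and
   current result as scalars, powf_specialcase recomputes the result, svsel
   merges it back, and svpnext_b32 advances to the next active lane.  */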
static inline svfloat32_t
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
{
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      float sx1 = svclastb (p, 0, x1);
      float sx2 = svclastb (p, 0, x2);
      float elem = svclastb (p, 0, y);
      elem = powf_specialcase (sx1, sx2, elem);
      svfloat32_t y2 = sv_f32 (elem);
      y = svsel (p, y2, y);
      p = svpnext_b32 (cmp, p);
    }
  return y;
}

/* Compute core for half of the lanes in double precision.  */
static inline svfloat64_t
sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
                  svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
                  const struct data *d)
{
  svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
  svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);

  /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k.  */
  svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
  svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));

  /* Polynomial to approximate log1p(r)/ln2.  */
  svfloat64_t logx = A (0);
  logx = svmla_x (pg, A (1), r, logx);
  logx = svmla_x (pg, A (2), r, logx);
  logx = svmla_x (pg, A (3), r, logx);
  logx = svmla_x (pg, y0, r, logx);
  *pylogx = svmul_x (pg, y, logx);

  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
  svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
  svuint64_t ki = svreinterpret_u64 (kd);
  kd = svsub_x (pg, kd, Shift);

  r = svsub_x (pg, *pylogx, kd);

  /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).  */
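  /* The scale table is arranged so that adding
     ski << (52 - V_POWF_EXP2_TABLE_BITS) to the entry selected by ki % N
     yields the bit pattern of 2^(ki/N); the sign bias in ski, when set,
     lands in bit 63 and negates s.  */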
  svuint64_t t
      = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
  svuint64_t ski = svadd_x (pg, ki, sign_bias);
  t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
  svfloat64_t s = svreinterpret_f64 (t);

  svfloat64_t p = C (0);
  p = svmla_x (pg, C (1), p, r);
  p = svmla_x (pg, C (2), p, r);
  p = svmla_x (pg, s, p, svmul_x (pg, s, r));

  return p;
}

/* Widen vector to double precision and compute core on both halves of the
   vector. Lower cost of promotion by considering all lanes active.  */
static inline svfloat32_t
sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
              svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
              const struct data *d)
{
  const svbool_t ptrue = svptrue_b64 ();

  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
     halves in order to perform the core computation in double precision.  */
  const svbool_t pg_lo = svunpklo (pg);
  const svbool_t pg_hi = svunpkhi (pg);
  svfloat64_t y_lo = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
  svfloat64_t y_hi = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
  svfloat32_t z = svreinterpret_f32 (iz);
  svfloat64_t z_lo = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
  svfloat64_t z_hi = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
  svuint64_t i_lo = svunpklo (i);
  svuint64_t i_hi = svunpkhi (i);
  svint64_t k_lo = svunpklo (k);
  svint64_t k_hi = svunpkhi (k);
  svuint64_t sign_bias_lo = svunpklo (sign_bias);
  svuint64_t sign_bias_hi = svunpkhi (sign_bias);

  /* Compute each part in double precision.  */
  svfloat64_t ylogx_lo, ylogx_hi;
  svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
                                     sign_bias_lo, &ylogx_lo, d);
  svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
                                     sign_bias_hi, &ylogx_hi, d);

  /* Convert back to single-precision and interleave.  */
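  /* The narrowing convert writes each result into the even 32-bit element of
     its 64-bit container, so svuzp1 of the low-half and high-half results
     restores the original lane order of the full vector.  */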
  svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
  svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
  *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
  svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
  svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
  return svuzp1 (lo_32, hi_32);
}

/* Implementation of SVE powf.
   Provides the same accuracy as AdvSIMD powf, since it relies on the same
   algorithm. The theoretical maximum error is under 2.60 ULPs.
   Maximum measured error is 2.56 ULPs:
   SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
                                                   want 0x1.fd4b06p+127.  */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  svuint32_t vix0 = svreinterpret_u32 (x);
  svuint32_t viy0 = svreinterpret_u32 (y);

  /* Negative x cases.  */
  svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
  svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);

  /* Set sign_bias and ix depending on sign of x and nature of y.  */
  svbool_t yisnotint_xisneg = svpfalse_b ();
  svuint32_t sign_bias = sv_u32 (0);
  svuint32_t vix = vix0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y.  */
      yisnotint_xisneg = svisnotint (xisneg, y);
      svbool_t yisint_xisneg = svisint (xisneg, y);
      svbool_t yisodd_xisneg = svisodd (xisneg, y);
      /* ix set to abs(ix) if y is integer.  */
      vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
      /* Set to SignBias if x is negative and y is odd.  */
      sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
    }

  /* Special cases of x or y: zero, inf and nan.  */
  svbool_t xspecial = sv_zeroinfnan (pg, vix0);
  svbool_t yspecial = sv_zeroinfnan (pg, viy0);
  svbool_t cmp = svorr_z (pg, xspecial, yspecial);

  /* Small cases of x: |x| < 0x1p-126.  */
  svbool_t xsmall = svaclt (pg, x, d->small_bound);
  if (unlikely (svptest_any (pg, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative.  */
      svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
      vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
      vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
      vix = svsel (xsmall, vix_norm, vix);
    }
  /* Part of core computation carried in working precision.  */
  svuint32_t tmp = svsub_x (pg, vix, d->off);
  svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
                          V_POWF_LOG2_N - 1);
  svuint32_t top = svand_x (pg, tmp, 0xff800000);
  svuint32_t iz = svsub_x (pg, vix, top);
  svint32_t k
      = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
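  /* tmp splits x (with the table offset removed) into a log2-table index i
     taken from the top mantissa bits, an exponent part top, and a reduced
     significand iz with that exponent removed.  Shifting top right by
     (23 - V_POWF_EXP2_TABLE_BITS) gives the exponent already scaled by
     V_POWF_EXP2_N, matching the scaling of the coefficients and bounds
     above.  */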

  /* Compute core in extended precision and return intermediate ylogx results
     to handle cases of underflow and overflow in exp.  */
  svfloat32_t ylogx;
  svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);

  /* Handle exp special cases of underflow and overflow.  */
  svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
  svfloat32_t ret_oflow
      = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
  svfloat32_t ret_uflow = svreinterpret_f32 (sign);
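  /* The shift places SignBias at bit 31, so sign is either 0 or the float
     sign bit: ret_uflow is +/-0 and ret_oflow is +/-inf, with the sign
     expected for a negative base raised to an odd integer power.  */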
  ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
  ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);

  /* Cases of finite y and finite negative x.  */
  ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);

  if (unlikely (svptest_any (pg, cmp)))
    return sv_call_powf_sc (x, y, ret, cmp);

  return ret;
}

PL_SIG (SV, F, 2, pow)
PL_TEST_ULP (SV_NAME_F2 (pow), 2.06)
/* Wide intervals spanning the whole domain but shared between x and y.  */
#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n)                              \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n)                 \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n)               \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n)               \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
/* x~1 or y~1.  */
SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* Around estimated argmaxs of ULP error.  */
SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer.  */
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer.  */
SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y.  */
SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y.  */
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)