1 /*
2  * Single-precision inverse error function (AdvSIMD variant).
3  *
4  * Copyright (c) 2023, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 #include "v_math.h"
8 #include "pl_sig.h"
9 #include "pl_test.h"
10 #include "poly_advsimd_f32.h"
11 #include "v_logf_inline.h"
12 
13 const static struct data
14 {
15   /*  We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
16       coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
17       of the denominator. Coefficients are stored in various interleaved
18       formats to allow for table-based (vector-to-vector) lookup.
19 
20       Plo is first two coefficients of P_10 and P_29 interleaved.
21       PQ is third coeff of P_10 and first of Q_29 interleaved.
22       Qhi is second and third coeffs of Q_29 interleaved.
23       P29_3 is a homogenous vector with fourth coeff of P_29.
24 
25       P_10 and Q_10 are also stored in homogenous vectors to allow better
26       memory access when no lanes are in a tail region.  */
27   float32x4_t Plo, PQ, Qhi, P29_3, tailshift;
28   float32x4_t P_50[6], Q_50[2];
29   float32x4_t P_10[3], Q_10[3];
30   uint8x16_t idxhi, idxlo;
31   struct v_logf_data logf_tbl;
32 } data = {
33   .idxlo = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
34   .idxhi = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
35   .P29_3 = V4 (0x1.b13626p-2),
36   .tailshift = V4 (-0.87890625),
37   .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 },
38   .PQ = { -0x1.293ff6p+3, -0x1.f59ee2p+0, -0x1.8265eep+3, -0x1.69952p-4 },
39   .Qhi = { 0x1.ef5eaep+4, 0x1.c7b7d2p-1, -0x1.12665p+4, -0x1.167d7p+1 },
40   .P_50 = { V4 (0x1.3d8948p-3), V4 (0x1.61f9eap+0), V4 (0x1.61c6bcp-1),
41 	    V4 (-0x1.20c9f2p+0), V4 (0x1.5c704cp-1), V4 (-0x1.50c6bep-3) },
42   .Q_50 = { V4 (0x1.3d7dacp-3), V4 (0x1.629e5p+0) },
43   .P_10 = { V4 (-0x1.a31268p+3), V4 (0x1.ac9048p+4), V4 (-0x1.293ff6p+3) },
44   .Q_10 = { V4 (-0x1.8265eep+3), V4 (0x1.ef5eaep+4), V4 (-0x1.12665p+4) },
45   .logf_tbl = V_LOGF_CONSTANTS
46 };
47 
48 static inline float32x4_t
special(float32x4_t x,const struct data * d)49 special (float32x4_t x, const struct data *d)
50 {
51   /* Note erfinvf(inf) should return NaN, and erfinvf(1) should return Inf.
52      By using log here, instead of log1p, we return finite values for both
53      these inputs, and values outside [-1, 1]. This is non-compliant, but is an
54      acceptable optimisation at Ofast. To get correct behaviour for all finite
55      values use the log1pf_inline helper on -abs(x) - note that erfinvf(inf)
56      will still be finite.  */
57   float32x4_t t = vdivq_f32 (
58       v_f32 (1), vsqrtq_f32 (vnegq_f32 (v_logf_inline (
59 		     vsubq_f32 (v_f32 (1), vabsq_f32 (x)), &d->logf_tbl))));
60   float32x4_t ts = vbslq_f32 (v_u32 (0x7fffffff), t, x);
61   float32x4_t q = vfmaq_f32 (d->Q_50[0], vaddq_f32 (t, d->Q_50[1]), t);
62   return vdivq_f32 (v_horner_5_f32 (t, d->P_50), vmulq_f32 (ts, q));
63 }
64 
65 static inline float32x4_t
notails(float32x4_t x,const struct data * d)66 notails (float32x4_t x, const struct data *d)
67 {
68   /* Shortcut when no input is in a tail region - no need to gather shift or
69      coefficients.  */
70   float32x4_t t = vfmaq_f32 (v_f32 (-0.5625), x, x);
71   float32x4_t q = vaddq_f32 (t, d->Q_10[2]);
72   q = vfmaq_f32 (d->Q_10[1], t, q);
73   q = vfmaq_f32 (d->Q_10[0], t, q);
74 
75   return vdivq_f32 (vmulq_f32 (x, v_horner_2_f32 (t, d->P_10)), q);
76 }
77 
78 static inline float32x4_t
lookup(float32x4_t tbl,uint8x16_t idx)79 lookup (float32x4_t tbl, uint8x16_t idx)
80 {
81   return vreinterpretq_f32_u8 (vqtbl1q_u8 (vreinterpretq_u8_f32 (tbl), idx));
82 }
83 
84 /* Vector implementation of Blair et al's rational approximation to inverse
85    error function in single-precision. Worst-case error is 4.98 ULP, in the
86    tail region:
87    _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0
88 				  want 0x1.b4793ap+0 .  */
V_NAME_F1(erfinv)89 float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
90 {
91   const struct data *d = ptr_barrier (&data);
92 
93   /* Calculate inverse error using algorithm described in
94      J. M. Blair, C. A. Edwards, and J. H. Johnson,
95      "Rational Chebyshev approximations for the inverse of the error
96       function", Math. Comp. 30, pp. 827--830 (1976).
97      https://doi.org/10.1090/S0025-5718-1976-0421040-7.
98 
99     Algorithm has 3 intervals:
100      - 'Normal' region [-0.75, 0.75]
101      - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
102      - Extreme tail [-1, -0.9375] U [0.9375, 1]
103      Normal and tail are both rational approximation of similar order on
104      shifted input - these are typically performed in parallel using gather
105      loads to obtain correct coefficients depending on interval.  */
106   uint32x4_t is_tail = vcageq_f32 (x, v_f32 (0.75));
107   uint32x4_t extreme_tail = vcageq_f32 (x, v_f32 (0.9375));
108 
109   if (unlikely (!v_any_u32 (is_tail)))
110     /* Shortcut for if all lanes are in [-0.75, 0.75] - can avoid having to
111        gather coefficients. If input is uniform in [-1, 1] then likelihood of
112        this is 0.75^4 ~= 0.31.  */
113     return notails (x, d);
114 
115   /* Select requisite shift depending on interval: polynomial is evaluated on
116      x * x - shift.
117      Normal shift = 0.5625
118      Tail shift   = 0.87890625.  */
119   float32x4_t t
120       = vfmaq_f32 (vbslq_f32 (is_tail, d->tailshift, v_f32 (-0.5625)), x, x);
121 
122   /* Calculate indexes for tbl: tbl is byte-wise, so:
123      [0, 1, 2, 3, 4, 5, 6, ....] copies the vector
124      Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores
125      two pairs of coeffs, so we need two idx vectors - one for each pair.  */
126   uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4));
127   uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off);
128   uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off);
129 
130   /* Load the tables.  */
131   float32x4_t p_lo = d->Plo;
132   float32x4_t pq = d->PQ;
133   float32x4_t qhi = d->Qhi;
134 
135   /* Do the lookup (and calculate p3 by masking non-tail lanes).  */
136   float32x4_t p3 = vreinterpretq_f32_u32 (
137       vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3)));
138   float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi),
139 	      p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi),
140 	      q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi);
141 
142   float32x4_t p = vfmaq_f32 (p2, p3, t);
143   p = vfmaq_f32 (p1, p, t);
144   p = vfmaq_f32 (p0, p, t);
145   p = vmulq_f32 (x, p);
146 
147   float32x4_t q = vfmaq_f32 (q1, vaddq_f32 (q2, t), t);
148   q = vfmaq_f32 (q0, q, t);
149 
150   if (unlikely (v_any_u32 (extreme_tail)))
151     /* At least one lane is in the extreme tail - if input is uniform in
152        [-1, 1] the likelihood of this is ~0.23.  */
153     return vbslq_f32 (extreme_tail, special (x, d), vdivq_f32 (p, q));
154 
155   return vdivq_f32 (p, q);
156 }
157 
/* Signature and test registration for the vector erfinvf routine.
   NOTE(review): the ULP bound passed to PL_TEST_ULP (4.49) is lower than the
   4.98 ULP worst case quoted in the routine's header comment; presumably the
   macro expects the error in excess of 0.5 ULP - confirm against
   pl_test.h.  */
PL_SIG (V, F, 1, erfinv, -0.99, 0.99)
PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49)
/* Test with control lane in each interval: 0.5 is below the 0.75 tail
   threshold, 0.8 lies between 0.75 and 0.9375, and 0.95 is above the 0.9375
   extreme-tail threshold.  */
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5)
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8)
PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95)
164