1 //  genertic vectorized math functions
2 //  Copyright (C) 2010 Tim Blechmann
3 //
4 //  This program is free software; you can redistribute it and/or modify
5 //  it under the terms of the GNU General Public License as published by
6 //  the Free Software Foundation; either version 2 of the License, or
7 //  (at your option) any later version.
8 //
9 //  This program is distributed in the hope that it will be useful,
10 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 //  GNU General Public License for more details.
13 //
14 //  You should have received a copy of the GNU General Public License
15 //  along with this program; see the file COPYING.  If not, write to
16 //  the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
17 //  Boston, MA 02110-1301, USA.
18 
19 #ifndef NOVA_SIMD_DETAIL_VECMATH_HPP
20 #define NOVA_SIMD_DETAIL_VECMATH_HPP
21 
22 #include <cmath>
23 
24 #if defined(__GNUC__) && defined(NDEBUG)
25 #define always_inline inline  __attribute__((always_inline))
26 #else
27 #define always_inline inline
28 #endif
29 #include <limits>
30 
31 /* the approximation of mathematical functions should have a maximum relative error of below 5e-7f
32  */
33 
34 namespace nova
35 {
36 namespace detail
37 {
38 
39 template <typename VecType>
vec_sign(VecType const & arg)40 always_inline VecType vec_sign(VecType const & arg)
41 {
42     typedef VecType vec;
43     const vec zero       = vec::gen_zero();
44     const vec one        = vec::gen_one();
45     const vec sign_mask  = vec::gen_sign_mask();
46 
47     const vec nonzero    = mask_neq(arg, zero);
48     const vec sign       = arg & sign_mask;
49 
50     const vec abs_ret    = nonzero & one;
51     return sign | abs_ret;
52 }
53 
54 template <typename VecType>
vec_round_float(VecType const & arg)55 always_inline VecType vec_round_float(VecType const & arg)
56 {
57     typedef VecType vec;
58 
59     const vec sign    = arg & vec::gen_sign_mask();
60     const vec abs_arg = sign ^ arg;
61     const vec two_to_23_ps (0x1.0p23f);
62     const vec rounded = (abs_arg + two_to_23_ps) - two_to_23_ps;
63 
64     return sign ^ rounded;
65 }
66 
67 template <typename VecType>
vec_floor_float(VecType const & arg)68 always_inline VecType vec_floor_float(VecType const & arg)
69 {
70     typedef VecType vec;
71 
72     const vec rounded = vec_round_float(arg);
73 
74     const vec rounded_larger = mask_gt(rounded, arg);
75     const vec add            = rounded_larger & vec::gen_one();
76     return rounded - add;
77 }
78 
79 template <typename VecType>
vec_ceil_float(VecType const & arg)80 always_inline VecType vec_ceil_float(VecType const & arg)
81 {
82     typedef VecType vec;
83 
84     const vec rounded = vec_round_float(arg);
85 
86     const vec rounded_smaller = mask_lt(rounded, arg);
87     const vec add             = rounded_smaller & vec::gen_one();
88     return rounded + add;
89 }
90 
91 template <typename VecFloat>
ldexp_float(VecFloat const & x,typename VecFloat::int_vec const & n)92 always_inline VecFloat ldexp_float(VecFloat const & x, typename VecFloat::int_vec const & n)
93 {
94     typedef typename VecFloat::int_vec int_vec;
95 
96     const VecFloat exponent_mask = VecFloat::gen_exp_mask();
97     const VecFloat exponent = exponent_mask & x;
98     const VecFloat x_wo_x = andnot(exponent_mask, x);     // clear exponent
99 
100     int_vec new_exp = slli(n, 16+7) + int_vec(exponent);  // new exponent
101     VecFloat new_exp_float(new_exp);
102     VecFloat ret = x_wo_x | new_exp_float;
103     return ret;
104 }
105 
106 template <typename VecFloat>
frexp_float(VecFloat const & x,typename VecFloat::int_vec & exp)107 always_inline VecFloat frexp_float(VecFloat const & x, typename VecFloat::int_vec & exp)
108 {
109     typedef typename VecFloat::int_vec int_vec;
110 
111     const VecFloat exponent_mask = VecFloat::gen_exp_mask();
112     const VecFloat exponent = exponent_mask & x;
113     const VecFloat x_wo_x = andnot(exponent_mask, x);             // clear exponent
114 
115     const int_vec exp_int(exponent);
116 
117     exp = srli(exp_int, 16+7) - int_vec(126);
118     return x_wo_x | VecFloat::gen_exp_mask_1();
119 }
120 
121 /* adapted from cephes, approximation polynomial generated by sollya */
122 template <typename VecType>
vec_exp_float(VecType const & arg)123 always_inline VecType vec_exp_float(VecType const & arg)
124 {
125     typedef typename VecType::int_vec int_vec;
126 
127     /* Express e**x = e**g 2**n
128      *   = e**g e**( n loge(2) )
129      *   = e**( g + n loge(2) )
130      */
131 
132     // black magic
133     VecType x = arg;
134     VecType z = round(VecType(1.44269504088896341f) * x);
135     int_vec n = z.truncate_to_int();
136     x -= z*VecType(0.693359375f);
137     x -= z*VecType(-2.12194440e-4f);
138 
139     /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
140     VecType p = VecType(VecType::gen_one()) +
141         x * (1.00000035762786865234375f +
142         x * (0.4999996721744537353515625f +
143         x * (0.16665561497211456298828125f +
144         x * (4.167006909847259521484375e-2f +
145         x * (8.420792408287525177001953125e-3f +
146         x * 1.386119984090328216552734375e-3f)))));
147 
148     /* multiply by power of 2 */
149     VecType approx = ldexp_float(p, n);
150 
151     /* handle min/max boundaries */
152     const VecType maxlogf(88.72283905206835f);
153     const VecType minlogf(-88.0f);
154     const VecType max_float(std::numeric_limits<float>::max());
155     const VecType zero = VecType::gen_zero();
156 
157     VecType too_large = mask_gt(arg, maxlogf);
158     VecType too_small = mask_lt(arg, minlogf);
159 
160     VecType ret = select(approx, max_float, too_large);
161     ret = select(ret, zero, too_small);
162 
163     return ret;
164 }
165 
166 /* adapted from cephes */
167 template <typename VecType>
vec_log_float(VecType x)168 always_inline VecType vec_log_float(VecType x)
169 {
170     typedef typename VecType::int_vec int_vec;
171 
172     int_vec e;
173     x = frexp_float( x, e );
174 
175     const VecType sqrt_05 = 0.707106781186547524f;
176     const VecType x_smaller_sqrt_05 = mask_lt(x, sqrt_05);
177     e = e + int_vec(x_smaller_sqrt_05);
178     VecType x_add = x;
179     x_add = x_add & x_smaller_sqrt_05;
180     x += x_add - VecType(VecType::gen_one());
181 
182     VecType y =
183     (((((((( 7.0376836292E-2 * x
184     - 1.1514610310E-1) * x
185     + 1.1676998740E-1) * x
186     - 1.2420140846E-1) * x
187     + 1.4249322787E-1) * x
188     - 1.6668057665E-1) * x
189     + 2.0000714765E-1) * x
190     - 2.4999993993E-1) * x
191     + 3.3333331174E-1) * x * x*x;
192 
193     VecType fe = e.convert_to_float();
194     y += fe * -2.12194440e-4;
195 
196     y -= 0.5 * x*x;            /* y - 0.5 x^2 */
197     VecType z  = x + y;        /* ... + x  */
198 
199     return z + 0.693359375 * fe;
200 }
201 
202 
203 /* exp function for vec_tanh_float. similar to vec_exp_tanh, but without boundary checks */
204 template <typename VecType>
vec_exp_tanh_float(VecType const & arg)205 always_inline VecType vec_exp_tanh_float(VecType const & arg)
206 {
207     typedef typename VecType::int_vec int_vec;
208 
209     /* Express e**x = e**g 2**n
210      *   = e**g e**( n loge(2) )
211      *   = e**( g + n loge(2) )
212      */
213 
214     // black magic
215     VecType x = arg;
216     VecType z = round(VecType(1.44269504088896341f) * x);
217     int_vec n = z.truncate_to_int();
218     x -= z*VecType(0.693359375f);
219     x -= z*VecType(-2.12194440e-4f);
220 
221     /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
222     VecType p = 1.f +
223         x * (1.00000035762786865234375f +
224         x * (0.4999996721744537353515625f +
225         x * (0.16665561497211456298828125f +
226         x * (4.167006909847259521484375e-2f +
227         x * (8.420792408287525177001953125e-3f +
228         x * 1.386119984090328216552734375e-3f)))));
229 
230     /* multiply by power of 2 */
231     VecType approx = ldexp_float(p, n);
232 
233     return approx;
234 }
235 
236 
237 /* adapted from Julien Pommier's sse_mathfun.h, itself based on cephes */
238 template <typename VecType>
vec_sin_float(VecType const & arg)239 always_inline VecType vec_sin_float(VecType const & arg)
240 {
241     typedef typename VecType::int_vec int_vec;
242 
243     const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
244 
245     VecType sign = arg & VecType::gen_sign_mask();
246     VecType abs_arg = arg & VecType::gen_abs_mask();
247 
248     VecType y = abs_arg * four_over_pi;
249 
250     int_vec j = y.truncate_to_int();
251 
252     /* cephes: j=(j+1) & (~1) */
253     j = (j + int_vec(1)) & int_vec(~1);
254     y = j.convert_to_float();
255 
256     /* sign based on quadrant */
257     VecType swap_sign_bit = slli(j & int_vec(4), 29);
258     sign = sign ^ swap_sign_bit;
259 
260     /* polynomial mask */
261     VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));
262 
263     /* black magic */
264     static float DP1 = 0.78515625;
265     static float DP2 = 2.4187564849853515625e-4;
266     static float DP3 = 3.77489497744594108e-8;
267     VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
268 
269     /* [0..pi/4] */
270     VecType z = base * base;
271     VecType p1 = ((  2.443315711809948E-005 * z
272         - 1.388731625493765E-003) * z
273         + 4.166664568298827E-002) * z * z
274         -0.5f * z + 1.0
275         ;
276 
277     /* [pi/4..pi/2] */
278     VecType p2 = ((-1.9515295891E-4 * z
279          + 8.3321608736E-3) * z
280          - 1.6666654611E-1) * z * base + base;
281 
282     VecType approximation =  select(p1, p2, poly_mask);
283 
284     return approximation ^ sign;
285 }
286 
287 /* adapted from Julien Pommier's sse_mathfun.h, itself based on cephes */
288 template <typename VecType>
vec_cos_float(VecType const & arg)289 always_inline VecType vec_cos_float(VecType const & arg)
290 {
291     typedef typename VecType::int_vec int_vec;
292 
293     const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
294 
295     VecType abs_arg = arg & VecType::gen_abs_mask();
296 
297     VecType y = abs_arg * four_over_pi;
298 
299     int_vec j = y.truncate_to_int();
300 
301     /* cephes: j=(j+1) & (~1) */
302     j = (j + int_vec(1)) & int_vec(~1);
303     y = j.convert_to_float();
304 
305     /* sign based on quadrant */
306     int_vec jm2 = j - int_vec(2);
307     VecType sign = slli(andnot(jm2, int_vec(4)), 29);
308 
309     /* polynomial mask */
310     VecType poly_mask = VecType (mask_eq(jm2 & int_vec(2), int_vec(0)));
311 
312     /* black magic */
313     static float DP1 = 0.78515625;
314     static float DP2 = 2.4187564849853515625e-4;
315     static float DP3 = 3.77489497744594108e-8;
316     VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
317 
318     /* [0..pi/4] */
319     VecType z = base * base;
320     VecType p1 = ((  2.443315711809948E-005 * z
321         - 1.388731625493765E-003) * z
322         + 4.166664568298827E-002) * z * z
323         -0.5f * z + 1
324         ;
325 
326     /* [pi/4..pi/2] */
327     VecType p2 = ((-1.9515295891E-4 * z
328          + 8.3321608736E-3) * z
329          - 1.6666654611E-1) * z * base + base;
330 
331     VecType approximation =  select(p1, p2, poly_mask);
332 
333     return approximation ^ sign;
334 }
335 
336 /* adapted from cephes, approximation polynomial generted by sollya */
337 template <typename VecType>
vec_tan_float(VecType const & arg)338 always_inline VecType vec_tan_float(VecType const & arg)
339 {
340     typedef typename VecType::int_vec int_vec;
341     const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
342 
343     VecType sign = arg & VecType::gen_sign_mask();
344     VecType abs_arg = arg & VecType::gen_abs_mask();
345 
346     VecType y = abs_arg * four_over_pi;
347     int_vec j = y.truncate_to_int();
348 
349     /* cephes: j=(j+1) & (~1) */
350     j = (j + int_vec(1)) & int_vec(~1);
351     y = j.convert_to_float();
352 
353     /* approximation mask */
354     VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));
355 
356     /* black magic */
357     static float DP1 = 0.78515625;
358     static float DP2 = 2.4187564849853515625e-4;
359     static float DP3 = 3.77489497744594108e-8;
360     VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
361 
362     VecType x = base; VecType x2 = x*x;
363 
364     // sollya: fpminimax(tan(x), [|3,5,7,9,11,13|], [|24...|], [-pi/4,pi/4], x);
365     VecType approx =
366         x + x * x2 * (0.3333315551280975341796875 + x2 * (0.1333882510662078857421875 + x2 * (5.3409568965435028076171875e-2 + x2 * (2.443529665470123291015625e-2 + x2 * (3.1127030961215496063232421875e-3 + x2 * 9.3892104923725128173828125e-3)))));
367 
368     //VecType recip = -reciprocal(approx);
369     VecType recip = -1.0 / approx;
370 
371     VecType approximation = select(recip, approx, poly_mask);
372 
373     return approximation ^ sign;
374 }
375 
376 /* adapted from cephes, approximation polynomial generted by sollya */
377 template <typename VecType>
vec_asin_float(VecType const & arg)378 always_inline VecType vec_asin_float(VecType const & arg)
379 {
380     VecType abs_arg = arg & VecType::gen_abs_mask();
381     VecType sign = arg & VecType::gen_sign_mask();
382     VecType one = VecType::gen_one();
383     VecType half = VecType::gen_05();
384     VecType zero = VecType::gen_zero();
385 
386     // range redution: asin(x) = pi/2 - 2 asin( sqrt( (1-x)/2 ) ). for |arg| > 0.5
387     VecType arg_greater_05 = mask_gt(abs_arg, 0.5);
388     // VecType arg_reduced_sqr = (one - abs_arg) * half;
389     VecType arg_reduced = sqrt((one - abs_arg) * half);
390     VecType approx_arg = select(abs_arg, arg_reduced, arg_greater_05);
391 
392 
393     // VecType z = select(abs_arg*abs_arg, arg_reduced_sqr, arg_greater_05);
394 
395     VecType x = approx_arg; VecType x2 = x*x;
396     // sollya: fpminimax(asin(x), [|3,5,7,9,11|], [|24...|], [0.000000000000000000001,0.5], x);
397     VecType approx_poly = x + x * x2 * (0.166667520999908447265625 + x2 * (7.4953101575374603271484375e-2 + x2 * (4.54690195620059967041015625e-2 + x2 * (2.418550290167331695556640625e-2 + x2 * 4.21570129692554473876953125e-2))));
398 
399     VecType approx_poly_reduced = 1.57079637050628662109375 - approx_poly - approx_poly;
400     VecType approx = select(approx_poly, approx_poly_reduced, arg_greater_05);
401 
402     approx = approx ^ sign;
403     // |arg| > 1: return 0
404     VecType ret = select(approx, zero, mask_gt(abs_arg, one));
405     return ret;
406 }
407 
408 /* based on asin approximation:
409  *
410  * x < -0.5:        acos(x) = pi - 2.0 * asin( sqrt((1+x)/2) );
411  * -0.5 < x < 0.5   acos(x) = pi/2 - asin(x)
412  * x > 0.5          acos(x) =      2.0 * asin( sqrt((1-x)/2) ).
413  *
414  */
415 template <typename VecType>
vec_acos_float(VecType const & arg)416 always_inline VecType vec_acos_float(VecType const & arg)
417 {
418     VecType abs_arg = arg & VecType::gen_abs_mask();
419     VecType one = VecType::gen_one();
420     VecType half = VecType::gen_05();
421     VecType zero = VecType::gen_zero();
422 
423     VecType arg_greater_05 = mask_gt(abs_arg, half);
424     VecType asin_arg_greater_05 = sqrt((one - abs_arg) * half);
425 
426     VecType asin_arg = select(arg, asin_arg_greater_05, arg_greater_05);
427 
428     VecType asin = vec_asin_float(asin_arg);
429     VecType two_asin = asin + asin;
430 
431     VecType ret_m1_m05 = 3.1415927410125732421875 - two_asin;
432     VecType ret_m05_05 = 1.57079637050628662109375 - asin;
433     VecType ret_05_1 = two_asin;
434 
435     VecType ret_m05_1 = select(ret_m05_05, ret_05_1, mask_gt(arg, half));
436     VecType ret = select(ret_m1_m05, ret_m05_1, mask_gt(arg, -0.5));
437 
438     // |arg| > 1: return 0
439     ret = select(ret, zero, mask_gt(abs_arg, one));
440     return ret;
441 }
442 
443 /* adapted from cephes */
444 template <typename VecType>
vec_atan_float(VecType const & arg)445 always_inline VecType vec_atan_float(VecType const & arg)
446 {
447     const VecType sign_arg = arg & VecType::gen_sign_mask();
448     const VecType abs_arg  = arg & VecType::gen_abs_mask();
449     const VecType one      = VecType::gen_one();
450     VecType zero           = VecType::gen_zero();
451 
452     VecType arg_range0 = abs_arg;
453     VecType arg_range1 = (abs_arg - one) / (abs_arg + one);
454     VecType arg_range2 = -one / abs_arg;
455 
456     VecType offset_range0 = zero;
457     VecType offset_range1 = 0.78539816339744830961566084581987572104929234984377;
458     VecType offset_range2 = 1.57079632679489661923132169163975144209858469968754;
459 
460     VecType mask_range_01 = mask_gt(abs_arg, 0.41421356237309504880168872420969807856967187537695);
461     VecType mask_range_12 = mask_gt(abs_arg, 2.41421356237309504880168872420969807856967187537698);
462 
463     VecType approx_arg = select(arg_range0,
464                                 select(arg_range1, arg_range2, mask_range_12),
465                                 mask_range_01);
466 
467     VecType approx_offset = select(offset_range0,
468                                    select(offset_range1, offset_range2, mask_range_12),
469                                    mask_range_01);
470 
471 
472     VecType x = approx_arg;
473     VecType x2 = x*x;
474 
475     VecType approx = approx_offset +
476        x + x * x2 * (-0.333329498767852783203125 + x2 * (0.19977732002735137939453125 + x2 * (-0.1387787759304046630859375 + x2 * 8.054284751415252685546875e-2)));
477 
478     return approx ^ sign_arg;
479 }
480 
481 
482 template <typename VecType>
vec_tanh_float(VecType const & arg)483 always_inline VecType vec_tanh_float(VecType const & arg)
484 {
485     /* this order of computation (large->small->medium) seems to be the most efficient on sse*/
486 
487     const VecType sign_arg = arg & VecType::gen_sign_mask();
488     const VecType abs_arg  = arg ^ sign_arg;
489     const VecType one      = VecType::gen_one();
490     const VecType two (2.f);
491     const VecType maxlogf_2 (22.f);
492     const VecType limit_small (0.625f);
493 
494 	/* medium values */
495     const VecType result_medium_abs = one - two / (vec_exp_tanh_float(abs_arg + abs_arg) + one);
496 
497     /* large values */
498     const VecType abs_big          = mask_gt(abs_arg, maxlogf_2);
499     const VecType result_limit_abs = one;
500 
501     /* small values */
502     const VecType f1((float)-5.70498872745e-3);
503     const VecType f2((float) 2.06390887954e-2);
504     const VecType f3((float)-5.37397155531e-2);
505     const VecType f4((float) 1.33314422036e-1);
506     const VecType f5((float)-3.33332819422e-1);
507 
508     const VecType arg_sqr = abs_arg * abs_arg;
509     const VecType result_small = ((((f1 * arg_sqr
510                                      + f2) * arg_sqr
511                                     + f3) * arg_sqr
512                                    + f4) * arg_sqr
513                                   + f5) * arg_sqr * arg
514         + arg;
515 
516     const VecType abs_small = mask_lt(abs_arg, limit_small);
517 
518     /* select from large and medium branches and set sign */
519     const VecType result_lm_abs = select(result_medium_abs, result_limit_abs, abs_big);
520     const VecType result_lm = result_lm_abs | sign_arg;
521 
522     const VecType result = select(result_lm, result_small, abs_small);
523 
524     return result;
525 }
526 
527 template <typename VecType>
vec_signed_pow(VecType arg1,VecType arg2)528 always_inline VecType vec_signed_pow(VecType arg1, VecType arg2)
529 {
530     const VecType sign_arg1 = arg1 & VecType::gen_sign_mask();
531     const VecType abs_arg1  = arg1 ^ sign_arg1;
532 
533     const VecType result = pow(abs_arg1, arg2);
534 
535     return sign_arg1 | result;
536 }
537 
538 /* compute pow using exp and log. seems to be faster than table-based algorithms */
539 template <typename VecType>
vec_pow(VecType arg1,VecType arg2)540 always_inline VecType vec_pow(VecType arg1, VecType arg2)
541 {
542     const VecType zero      = VecType::gen_zero();
543     const VecType arg1_zero = mask_eq(arg1, zero);
544 
545     const VecType result = exp(arg2 * log(arg1));
546     return select(result, zero, arg1_zero);
547 }
548 
549 template <typename VecType>
vec_signed_sqrt(VecType arg)550 always_inline VecType vec_signed_sqrt(VecType arg)
551 {
552     const VecType sign_arg1 = arg & VecType::gen_sign_mask();
553     const VecType abs_arg1  = arg ^ sign_arg1;
554 
555     const VecType result = sqrt(abs_arg1);
556 
557     return sign_arg1 | result;
558 }
559 
560 template <typename VecType>
vec_log2(VecType arg)561 always_inline VecType vec_log2(VecType arg)
562 {
563     const double rlog2 = 1.0/std::log(2.0);
564     return log(arg) * VecType((typename VecType::float_type)rlog2);
565 }
566 
567 template <typename VecType>
vec_log10(VecType arg)568 always_inline VecType vec_log10(VecType arg)
569 {
570     const double rlog10 = 1.0/std::log(10.0);
571     return log(arg) * VecType((typename VecType::float_type)rlog10);
572 }
573 
574 template <typename VecType>
vec_select(VecType lhs,VecType rhs,VecType bitmask)575 always_inline VecType vec_select(VecType lhs, VecType rhs, VecType bitmask)
576 {
577     const VecType result = andnot(bitmask, lhs) | (bitmask & rhs);
578     return result;
579 }
580 
581 
582 template <typename VecType>
vec_undenormalize(VecType arg)583 always_inline VecType vec_undenormalize(VecType arg)
584 {
585     typedef typename VecType::float_type float_type;
586     const float_type min_positive_value = std::numeric_limits<float_type>::min();
587 
588     const VecType abs_arg = abs(arg);
589     const VecType abs_arg_lt_min = mask_lt(abs_arg, min_positive_value);
590     const VecType zero = VecType::gen_zero();
591 
592     const VecType result = select(arg, zero, abs_arg_lt_min);
593     return result;
594 }
595 
596 template <typename VecType>
vec_reciprocal_newton(VecType arg)597 always_inline VecType vec_reciprocal_newton(VecType arg)
598 {
599     const VecType one = VecType::gen_one();
600 
601     const VecType approx = fast_reciprocal(arg);
602 
603     // One round of Newton-Raphson refinement
604     const VecType diff = one - approx * arg;
605     const VecType result = madd(diff, approx, approx);
606 
607     return result;
608 }
609 
610 
611 }
612 }
613 
614 #undef always_inline
615 
616 #endif /* NOVA_SIMD_DETAIL_VECMATH_HPP */
617