1 //  genertic vectorized math functions
2 //  Copyright (C) 2010 Tim Blechmann
3 //
4 //  This program is free software; you can redistribute it and/or modify
5 //  it under the terms of the GNU General Public License as published by
6 //  the Free Software Foundation; either version 2 of the License, or
7 //  (at your option) any later version.
8 //
9 //  This program is distributed in the hope that it will be useful,
10 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 //  GNU General Public License for more details.
13 //
14 //  You should have received a copy of the GNU General Public License
15 //  along with this program; see the file COPYING.  If not, write to
16 //  the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 //  Boston, MA 02111-1307, USA.
18 
19 #ifndef NOVA_SIMD_DETAIL_VECMATH_HPP
20 #define NOVA_SIMD_DETAIL_VECMATH_HPP
21 
22 #include <cmath>
23 
24 #if defined(__GNUC__) && defined(NDEBUG)
25 #define always_inline inline  __attribute__((always_inline))
26 #else
27 #define always_inline inline
28 #endif
29 #include <limits>
30 
31 /* the approximation of mathematical functions should have a maximum relative error of below 5e-7f
32  */
33 
34 namespace nova
35 {
36 namespace detail
37 {
38 
39 template <typename VecType>
vec_sign(VecType const & arg)40 always_inline VecType vec_sign(VecType const & arg)
41 {
42     typedef VecType vec;
43     const vec zero       = vec::gen_zero();
44     const vec one        = vec::gen_one();
45     const vec sign_mask  = vec::gen_sign_mask();
46 
47     const vec nonzero    = mask_neq(arg, zero);
48     const vec sign       = arg & sign_mask;
49 
50     const vec abs_ret    = nonzero & one;
51     return sign | abs_ret;
52 }
53 
54 template <typename VecType>
vec_round_float(VecType const & arg)55 always_inline VecType vec_round_float(VecType const & arg)
56 {
57     typedef VecType vec;
58 
59     const vec sign    = arg & vec::gen_sign_mask();
60     const vec abs_arg = sign ^ arg;
61     const vec two_to_23_ps (0x1.0p23f);
62     const vec rounded = (abs_arg + two_to_23_ps) - two_to_23_ps;
63 
64     return sign ^ rounded;
65 }
66 
67 template <typename VecType>
vec_floor_float(VecType const & arg)68 always_inline VecType vec_floor_float(VecType const & arg)
69 {
70     typedef VecType vec;
71 
72     const vec rounded = vec_round_float(arg);
73 
74     const vec rounded_larger = mask_gt(rounded, arg);
75     const vec add            = rounded_larger & vec::gen_one();
76     return rounded - add;
77 }
78 
79 template <typename VecType>
vec_ceil_float(VecType const & arg)80 always_inline VecType vec_ceil_float(VecType const & arg)
81 {
82     typedef VecType vec;
83 
84     const vec rounded = vec_round_float(arg);
85 
86     const vec rounded_smaller = mask_lt(rounded, arg);
87     const vec add             = rounded_smaller & vec::gen_one();
88     return rounded + add;
89 }
90 
91 template <typename VecFloat>
ldexp_float(VecFloat const & x,typename VecFloat::int_vec const & n)92 always_inline VecFloat ldexp_float(VecFloat const & x, typename VecFloat::int_vec const & n)
93 {
94     typedef typename VecFloat::int_vec int_vec;
95 
96     const VecFloat exponent_mask = VecFloat::gen_exp_mask();
97     const VecFloat exponent = exponent_mask & x;
98     const VecFloat x_wo_x = andnot(exponent_mask, x);     // clear exponent
99 
100     int_vec new_exp = slli(n, 16+7) + int_vec(exponent);  // new exponent
101     VecFloat new_exp_float(new_exp);
102     VecFloat ret = x_wo_x | new_exp_float;
103     return ret;
104 }
105 
106 template <typename VecFloat>
frexp_float(VecFloat const & x,typename VecFloat::int_vec & exp)107 always_inline VecFloat frexp_float(VecFloat const & x, typename VecFloat::int_vec & exp)
108 {
109     typedef typename VecFloat::int_vec int_vec;
110 
111     const VecFloat exponent_mask = VecFloat::gen_exp_mask();
112     const VecFloat exponent = exponent_mask & x;
113     const VecFloat x_wo_x = andnot(exponent_mask, x);             // clear exponent
114 
115     const int_vec exp_int(exponent);
116 
117     exp = srli(exp_int, 16+7) - int_vec(126);
118     return x_wo_x | VecFloat::gen_exp_mask_1();
119 }
120 
121 /* adapted from cephes, approximation polynomial generated by sollya */
122 template <typename VecType>
vec_exp_float(VecType const & arg)123 always_inline VecType vec_exp_float(VecType const & arg)
124 {
125     typedef typename VecType::int_vec int_vec;
126 
127     /* Express e**x = e**g 2**n
128      *   = e**g e**( n loge(2) )
129      *   = e**( g + n loge(2) )
130      */
131 
132     // black magic
133     VecType x = arg;
134     VecType z = round(VecType(1.44269504088896341f) * x);
135     int_vec n = z.truncate_to_int();
136     x -= z*VecType(0.693359375f);
137     x -= z*VecType(-2.12194440e-4f);
138 
139     /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
140     VecType p = VecType(VecType::gen_one()) +
141         x * (1.00000035762786865234375f +
142         x * (0.4999996721744537353515625f +
143         x * (0.16665561497211456298828125f +
144         x * (4.167006909847259521484375e-2f +
145         x * (8.420792408287525177001953125e-3f +
146         x * 1.386119984090328216552734375e-3f)))));
147 
148     /* multiply by power of 2 */
149     VecType approx = ldexp_float(p, n);
150 
151     /* handle min/max boundaries */
152     const VecType maxlogf(88.72283905206835f);
153 //    const VecType minlogf(-103.278929903431851103f);
154 	const VecType minlogf = -maxlogf;
155     const VecType max_float(std::numeric_limits<float>::max());
156     const VecType zero = VecType::gen_zero();
157 
158     VecType too_large = mask_gt(arg, maxlogf);
159     VecType too_small = mask_lt(arg, minlogf);
160 
161     VecType ret = select(approx, max_float, too_large);
162     ret = select(ret, zero, too_small);
163 
164     return ret;
165 }
166 
167 /* adapted from cephes */
168 template <typename VecType>
vec_log_float(VecType x)169 always_inline VecType vec_log_float(VecType x)
170 {
171     typedef typename VecType::int_vec int_vec;
172 
173     int_vec e;
174     x = frexp_float( x, e );
175 
176     const VecType sqrt_05 = 0.707106781186547524f;
177     const VecType x_smaller_sqrt_05 = mask_lt(x, sqrt_05);
178     e = e + int_vec(x_smaller_sqrt_05);
179     VecType x_add = x;
180     x_add = x_add & x_smaller_sqrt_05;
181     x += x_add - VecType(VecType::gen_one());
182 
183     VecType y =
184     (((((((( 7.0376836292E-2 * x
185     - 1.1514610310E-1) * x
186     + 1.1676998740E-1) * x
187     - 1.2420140846E-1) * x
188     + 1.4249322787E-1) * x
189     - 1.6668057665E-1) * x
190     + 2.0000714765E-1) * x
191     - 2.4999993993E-1) * x
192     + 3.3333331174E-1) * x * x*x;
193 
194     VecType fe = e.convert_to_float();
195     y += fe * -2.12194440e-4;
196 
197     y -= 0.5 * x*x;            /* y - 0.5 x^2 */
198     VecType z  = x + y;        /* ... + x  */
199 
200     return z + 0.693359375 * fe;
201 }
202 
203 
204 /* exp function for vec_tanh_float. similar to vec_exp_tanh, but without boundary checks */
205 template <typename VecType>
vec_exp_tanh_float(VecType const & arg)206 always_inline VecType vec_exp_tanh_float(VecType const & arg)
207 {
208     typedef typename VecType::int_vec int_vec;
209 
210     /* Express e**x = e**g 2**n
211      *   = e**g e**( n loge(2) )
212      *   = e**( g + n loge(2) )
213      */
214 
215     // black magic
216     VecType x = arg;
217     VecType z = round(VecType(1.44269504088896341f) * x);
218     int_vec n = z.truncate_to_int();
219     x -= z*VecType(0.693359375f);
220     x -= z*VecType(-2.12194440e-4f);
221 
222     /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
223     VecType p = 1.f +
224         x * (1.00000035762786865234375f +
225         x * (0.4999996721744537353515625f +
226         x * (0.16665561497211456298828125f +
227         x * (4.167006909847259521484375e-2f +
228         x * (8.420792408287525177001953125e-3f +
229         x * 1.386119984090328216552734375e-3f)))));
230 
231     /* multiply by power of 2 */
232     VecType approx = ldexp_float(p, n);
233 
234     return approx;
235 }
236 
237 
238 /* adapted from Julien Pommier's sse_mathfun.h, itself based on cephes */
239 template <typename VecType>
vec_sin_float(VecType const & arg)240 always_inline VecType vec_sin_float(VecType const & arg)
241 {
242     typedef typename VecType::int_vec int_vec;
243 
244     const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
245 
246     VecType sign = arg & VecType::gen_sign_mask();
247     VecType abs_arg = arg & VecType::gen_abs_mask();
248 
249     VecType y = abs_arg * four_over_pi;
250 
251     int_vec j = y.truncate_to_int();
252 
253     /* cephes: j=(j+1) & (~1) */
254     j = (j + int_vec(1)) & int_vec(~1);
255     y = j.convert_to_float();
256 
257     /* sign based on quadrant */
258     VecType swap_sign_bit = slli(j & int_vec(4), 29);
259     sign = sign ^ swap_sign_bit;
260 
261     /* polynomial mask */
262     VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));
263 
264     /* black magic */
265     static float DP1 = 0.78515625;
266     static float DP2 = 2.4187564849853515625e-4;
267     static float DP3 = 3.77489497744594108e-8;
268     VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
269 
270     /* [0..pi/4] */
271     VecType z = base * base;
272     VecType p1 = ((  2.443315711809948E-005 * z
273         - 1.388731625493765E-003) * z
274         + 4.166664568298827E-002) * z * z
275         -0.5f * z + 1.0
276         ;
277 
278     /* [pi/4..pi/2] */
279     VecType p2 = ((-1.9515295891E-4 * z
280          + 8.3321608736E-3) * z
281          - 1.6666654611E-1) * z * base + base;
282 
283     VecType approximation =  select(p1, p2, poly_mask);
284 
285     return approximation ^ sign;
286 }
287 
288 /* adapted from Julien Pommier's sse_mathfun.h, itself based on cephes */
289 template <typename VecType>
vec_cos_float(VecType const & arg)290 always_inline VecType vec_cos_float(VecType const & arg)
291 {
292     typedef typename VecType::int_vec int_vec;
293 
294     const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
295 
296     VecType abs_arg = arg & VecType::gen_abs_mask();
297 
298     VecType y = abs_arg * four_over_pi;
299 
300     int_vec j = y.truncate_to_int();
301 
302     /* cephes: j=(j+1) & (~1) */
303     j = (j + int_vec(1)) & int_vec(~1);
304     y = j.convert_to_float();
305 
306     /* sign based on quadrant */
307     int_vec jm2 = j - int_vec(2);
308     VecType sign = slli(andnot(jm2, int_vec(4)), 29);
309 
310     /* polynomial mask */
311     VecType poly_mask = VecType (mask_eq(jm2 & int_vec(2), int_vec(0)));
312 
313     /* black magic */
314     static float DP1 = 0.78515625;
315     static float DP2 = 2.4187564849853515625e-4;
316     static float DP3 = 3.77489497744594108e-8;
317     VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
318 
319     /* [0..pi/4] */
320     VecType z = base * base;
321     VecType p1 = ((  2.443315711809948E-005 * z
322         - 1.388731625493765E-003) * z
323         + 4.166664568298827E-002) * z * z
324         -0.5f * z + 1
325         ;
326 
327     /* [pi/4..pi/2] */
328     VecType p2 = ((-1.9515295891E-4 * z
329          + 8.3321608736E-3) * z
330          - 1.6666654611E-1) * z * base + base;
331 
332     VecType approximation =  select(p1, p2, poly_mask);
333 
334     return approximation ^ sign;
335 }
336 
337 /* adapted from cephes, approximation polynomial generted by sollya */
338 template <typename VecType>
vec_tan_float(VecType const & arg)339 always_inline VecType vec_tan_float(VecType const & arg)
340 {
341     typedef typename VecType::int_vec int_vec;
342     const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
343 
344     VecType sign = arg & VecType::gen_sign_mask();
345     VecType abs_arg = arg & VecType::gen_abs_mask();
346 
347     VecType y = abs_arg * four_over_pi;
348     int_vec j = y.truncate_to_int();
349 
350     /* cephes: j=(j+1) & (~1) */
351     j = (j + int_vec(1)) & int_vec(~1);
352     y = j.convert_to_float();
353 
354     /* approximation mask */
355     VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));
356 
357     /* black magic */
358     static float DP1 = 0.78515625;
359     static float DP2 = 2.4187564849853515625e-4;
360     static float DP3 = 3.77489497744594108e-8;
361     VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
362 
363     VecType x = base; VecType x2 = x*x;
364 
365     // sollya: fpminimax(tan(x), [|3,5,7,9,11,13|], [|24...|], [-pi/4,pi/4], x);
366     VecType approx =
367         x + x * x2 * (0.3333315551280975341796875 + x2 * (0.1333882510662078857421875 + x2 * (5.3409568965435028076171875e-2 + x2 * (2.443529665470123291015625e-2 + x2 * (3.1127030961215496063232421875e-3 + x2 * 9.3892104923725128173828125e-3)))));
368 
369     //VecType recip = -reciprocal(approx);
370     VecType recip = -1.0 / approx;
371 
372     VecType approximation = select(recip, approx, poly_mask);
373 
374     return approximation ^ sign;
375 }
376 
377 /* adapted from cephes, approximation polynomial generted by sollya */
378 template <typename VecType>
vec_asin_float(VecType const & arg)379 always_inline VecType vec_asin_float(VecType const & arg)
380 {
381     VecType abs_arg = arg & VecType::gen_abs_mask();
382     VecType sign = arg & VecType::gen_sign_mask();
383     VecType one = VecType::gen_one();
384     VecType half = VecType::gen_05();
385     VecType zero = VecType::gen_zero();
386 
387     // range redution: asin(x) = pi/2 - 2 asin( sqrt( (1-x)/2 ) ). for |arg| > 0.5
388     VecType arg_greater_05 = mask_gt(abs_arg, 0.5);
389     VecType arg_reduced_sqr = (one - abs_arg) * half;
390     VecType arg_reduced = sqrt((one - abs_arg) * half);
391     VecType approx_arg = select(abs_arg, arg_reduced, arg_greater_05);
392 
393 
394     VecType z = select(abs_arg*abs_arg, arg_reduced_sqr, arg_greater_05);
395 
396     VecType x = approx_arg; VecType x2 = x*x;
397     // sollya: fpminimax(asin(x), [|3,5,7,9,11|], [|24...|], [0.000000000000000000001,0.5], x);
398     VecType approx_poly = x + x * x2 * (0.166667520999908447265625 + x2 * (7.4953101575374603271484375e-2 + x2 * (4.54690195620059967041015625e-2 + x2 * (2.418550290167331695556640625e-2 + x2 * 4.21570129692554473876953125e-2))));
399 
400     VecType approx_poly_reduced = 1.57079637050628662109375 - approx_poly - approx_poly;
401     VecType approx = select(approx_poly, approx_poly_reduced, arg_greater_05);
402 
403     approx = approx ^ sign;
404     // |arg| > 1: return 0
405     VecType ret = select(approx, zero, mask_gt(abs_arg, one));
406     return ret;
407 }
408 
409 /* based on asin approximation:
410  *
411  * x < -0.5:        acos(x) = pi - 2.0 * asin( sqrt((1+x)/2) );
412  * -0.5 < x < 0.5   acos(x) = pi/2 - asin(x)
413  * x > 0.5          acos(x) =      2.0 * asin( sqrt((1-x)/2) ).
414  *
415  */
416 template <typename VecType>
vec_acos_float(VecType const & arg)417 always_inline VecType vec_acos_float(VecType const & arg)
418 {
419     VecType abs_arg = arg & VecType::gen_abs_mask();
420     VecType one = VecType::gen_one();
421     VecType half = VecType::gen_05();
422     VecType zero = VecType::gen_zero();
423 
424     VecType arg_greater_05 = mask_gt(abs_arg, half);
425     VecType asin_arg_greater_05 = sqrt((one - abs_arg) * half);
426 
427     VecType asin_arg = select(arg, asin_arg_greater_05, arg_greater_05);
428 
429     VecType asin = vec_asin_float(asin_arg);
430     VecType two_asin = asin + asin;
431 
432     VecType ret_m1_m05 = 3.1415927410125732421875 - two_asin;
433     VecType ret_m05_05 = 1.57079637050628662109375 - asin;
434     VecType ret_05_1 = two_asin;
435 
436     VecType ret_m05_1 = select(ret_m05_05, ret_05_1, mask_gt(arg, half));
437     VecType ret = select(ret_m1_m05, ret_m05_1, mask_gt(arg, -0.5));
438 
439     // |arg| > 1: return 0
440     ret = select(ret, zero, mask_gt(abs_arg, one));
441     return ret;
442 }
443 
444 /* adapted from cephes */
445 template <typename VecType>
vec_atan_float(VecType const & arg)446 always_inline VecType vec_atan_float(VecType const & arg)
447 {
448     const VecType sign_arg = arg & VecType::gen_sign_mask();
449     const VecType abs_arg  = arg & VecType::gen_abs_mask();
450     const VecType one      = VecType::gen_one();
451     VecType zero           = VecType::gen_zero();
452 
453     VecType arg_range0 = abs_arg;
454     VecType arg_range1 = (abs_arg - one) / (abs_arg + one);
455     VecType arg_range2 = -one / abs_arg;
456 
457     VecType offset_range0 = zero;
458     VecType offset_range1 = 0.78539816339744830961566084581987572104929234984377;
459     VecType offset_range2 = 1.57079632679489661923132169163975144209858469968754;
460 
461     VecType mask_range_01 = mask_gt(abs_arg, 0.41421356237309504880168872420969807856967187537695);
462     VecType mask_range_12 = mask_gt(abs_arg, 2.41421356237309504880168872420969807856967187537698);
463 
464     VecType approx_arg = select(arg_range0,
465                                 select(arg_range1, arg_range2, mask_range_12),
466                                 mask_range_01);
467 
468     VecType approx_offset = select(offset_range0,
469                                    select(offset_range1, offset_range2, mask_range_12),
470                                    mask_range_01);
471 
472 
473     VecType x = approx_arg;
474     VecType x2 = x*x;
475 
476     VecType approx = approx_offset +
477        x + x * x2 * (-0.333329498767852783203125 + x2 * (0.19977732002735137939453125 + x2 * (-0.1387787759304046630859375 + x2 * 8.054284751415252685546875e-2)));
478 
479     return approx ^ sign_arg;
480 }
481 
482 
483 template <typename VecType>
vec_tanh_float(VecType const & arg)484 always_inline VecType vec_tanh_float(VecType const & arg)
485 {
486     /* this order of computation (large->small->medium) seems to be the most efficient on sse*/
487 
488     const VecType sign_arg = arg & VecType::gen_sign_mask();
489     const VecType abs_arg  = arg ^ sign_arg;
490     const VecType one      = VecType::gen_one();
491     const VecType two (2.f);
492     const VecType maxlogf_2 (22.f);
493     const VecType limit_small (0.625f);
494 
495 	/* medium values */
496     const VecType result_medium_abs = one - two / (vec_exp_tanh_float(abs_arg + abs_arg) + one);
497 
498     /* large values */
499     const VecType abs_big          = mask_gt(abs_arg, maxlogf_2);
500     const VecType result_limit_abs = one;
501 
502     /* small values */
503     const VecType f1((float)-5.70498872745e-3);
504     const VecType f2((float) 2.06390887954e-2);
505     const VecType f3((float)-5.37397155531e-2);
506     const VecType f4((float) 1.33314422036e-1);
507     const VecType f5((float)-3.33332819422e-1);
508 
509     const VecType arg_sqr = abs_arg * abs_arg;
510     const VecType result_small = ((((f1 * arg_sqr
511                                      + f2) * arg_sqr
512                                     + f3) * arg_sqr
513                                    + f4) * arg_sqr
514                                   + f5) * arg_sqr * arg
515         + arg;
516 
517     const VecType abs_small = mask_lt(abs_arg, limit_small);
518 
519     /* select from large and medium branches and set sign */
520     const VecType result_lm_abs = select(result_medium_abs, result_limit_abs, abs_big);
521     const VecType result_lm = result_lm_abs | sign_arg;
522 
523     const VecType result = select(result_lm, result_small, abs_small);
524 
525     return result;
526 }
527 
528 template <typename VecType>
vec_signed_pow(VecType arg1,VecType arg2)529 always_inline VecType vec_signed_pow(VecType arg1, VecType arg2)
530 {
531     const VecType sign_arg1 = arg1 & VecType::gen_sign_mask();
532     const VecType abs_arg1  = arg1 ^ sign_arg1;
533 
534     const VecType result = pow(abs_arg1, arg2);
535 
536     return sign_arg1 | result;
537 }
538 
539 /* compute pow using exp and log. seems to be faster than table-based algorithms */
540 template <typename VecType>
vec_pow(VecType arg1,VecType arg2)541 always_inline VecType vec_pow(VecType arg1, VecType arg2)
542 {
543     const VecType zero      = VecType::gen_zero();
544     const VecType arg1_zero = mask_eq(arg1, zero);
545 
546     const VecType result = exp(arg2 * log(arg1));
547     return select(result, zero, arg1_zero);
548 }
549 
550 template <typename VecType>
vec_signed_sqrt(VecType arg)551 always_inline VecType vec_signed_sqrt(VecType arg)
552 {
553     const VecType sign_arg1 = arg & VecType::gen_sign_mask();
554     const VecType abs_arg1  = arg ^ sign_arg1;
555 
556     const VecType result = sqrt(abs_arg1);
557 
558     return sign_arg1 | result;
559 }
560 
561 template <typename VecType>
vec_log2(VecType arg)562 always_inline VecType vec_log2(VecType arg)
563 {
564     const double rlog2 = 1.0/std::log(2.0);
565     return log(arg) * VecType((typename VecType::float_type)rlog2);
566 }
567 
568 template <typename VecType>
vec_log10(VecType arg)569 always_inline VecType vec_log10(VecType arg)
570 {
571     const double rlog10 = 1.0/std::log(10.0);
572     return log(arg) * VecType((typename VecType::float_type)rlog10);
573 }
574 
575 template <typename VecType>
vec_select(VecType lhs,VecType rhs,VecType bitmask)576 always_inline VecType vec_select(VecType lhs, VecType rhs, VecType bitmask)
577 {
578     const VecType result = andnot(bitmask, lhs) | (bitmask & rhs);
579     return result;
580 }
581 
582 
583 template <typename VecType>
vec_undenormalize(VecType arg)584 always_inline VecType vec_undenormalize(VecType arg)
585 {
586     typedef typename VecType::float_type float_type;
587     const float_type min_positive_value = std::numeric_limits<float_type>::min();
588 
589     const VecType abs_arg = abs(arg);
590     const VecType abs_arg_lt_min = mask_lt(abs_arg, min_positive_value);
591     const VecType zero = VecType::gen_zero();
592 
593     const VecType result = select(arg, zero, abs_arg_lt_min);
594     return result;
595 }
596 
597 template <typename VecType>
vec_reciprocal_newton(VecType arg)598 always_inline VecType vec_reciprocal_newton(VecType arg)
599 {
600     const VecType one = VecType::gen_one();
601 
602     const VecType approx = fast_reciprocal(arg);
603 
604     // One round of Newton-Raphson refinement
605     const VecType diff = one - approx * arg;
606     const VecType result = madd(diff, approx, approx);
607 
608     return result;
609 }
610 
611 
612 }
613 }
614 
615 #undef always_inline
616 
617 #endif /* NOVA_SIMD_DETAIL_VECMATH_HPP */
618