1 // genertic vectorized math functions
2 // Copyright (C) 2010 Tim Blechmann
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
17 // Boston, MA 02110-1301, USA.
18
19 #ifndef NOVA_SIMD_DETAIL_VECMATH_HPP
20 #define NOVA_SIMD_DETAIL_VECMATH_HPP
21
22 #include <cmath>
23
24 #if defined(__GNUC__) && defined(NDEBUG)
25 #define always_inline inline __attribute__((always_inline))
26 #else
27 #define always_inline inline
28 #endif
29 #include <limits>
30
31 /* the approximation of mathematical functions should have a maximum relative error of below 5e-7f
32 */
33
34 namespace nova
35 {
36 namespace detail
37 {
38
39 template <typename VecType>
vec_sign(VecType const & arg)40 always_inline VecType vec_sign(VecType const & arg)
41 {
42 typedef VecType vec;
43 const vec zero = vec::gen_zero();
44 const vec one = vec::gen_one();
45 const vec sign_mask = vec::gen_sign_mask();
46
47 const vec nonzero = mask_neq(arg, zero);
48 const vec sign = arg & sign_mask;
49
50 const vec abs_ret = nonzero & one;
51 return sign | abs_ret;
52 }
53
54 template <typename VecType>
vec_round_float(VecType const & arg)55 always_inline VecType vec_round_float(VecType const & arg)
56 {
57 typedef VecType vec;
58
59 const vec sign = arg & vec::gen_sign_mask();
60 const vec abs_arg = sign ^ arg;
61 const vec two_to_23_ps (0x1.0p23f);
62 const vec rounded = (abs_arg + two_to_23_ps) - two_to_23_ps;
63
64 return sign ^ rounded;
65 }
66
67 template <typename VecType>
vec_floor_float(VecType const & arg)68 always_inline VecType vec_floor_float(VecType const & arg)
69 {
70 typedef VecType vec;
71
72 const vec rounded = vec_round_float(arg);
73
74 const vec rounded_larger = mask_gt(rounded, arg);
75 const vec add = rounded_larger & vec::gen_one();
76 return rounded - add;
77 }
78
79 template <typename VecType>
vec_ceil_float(VecType const & arg)80 always_inline VecType vec_ceil_float(VecType const & arg)
81 {
82 typedef VecType vec;
83
84 const vec rounded = vec_round_float(arg);
85
86 const vec rounded_smaller = mask_lt(rounded, arg);
87 const vec add = rounded_smaller & vec::gen_one();
88 return rounded + add;
89 }
90
91 template <typename VecFloat>
ldexp_float(VecFloat const & x,typename VecFloat::int_vec const & n)92 always_inline VecFloat ldexp_float(VecFloat const & x, typename VecFloat::int_vec const & n)
93 {
94 typedef typename VecFloat::int_vec int_vec;
95
96 const VecFloat exponent_mask = VecFloat::gen_exp_mask();
97 const VecFloat exponent = exponent_mask & x;
98 const VecFloat x_wo_x = andnot(exponent_mask, x); // clear exponent
99
100 int_vec new_exp = slli(n, 16+7) + int_vec(exponent); // new exponent
101 VecFloat new_exp_float(new_exp);
102 VecFloat ret = x_wo_x | new_exp_float;
103 return ret;
104 }
105
106 template <typename VecFloat>
frexp_float(VecFloat const & x,typename VecFloat::int_vec & exp)107 always_inline VecFloat frexp_float(VecFloat const & x, typename VecFloat::int_vec & exp)
108 {
109 typedef typename VecFloat::int_vec int_vec;
110
111 const VecFloat exponent_mask = VecFloat::gen_exp_mask();
112 const VecFloat exponent = exponent_mask & x;
113 const VecFloat x_wo_x = andnot(exponent_mask, x); // clear exponent
114
115 const int_vec exp_int(exponent);
116
117 exp = srli(exp_int, 16+7) - int_vec(126);
118 return x_wo_x | VecFloat::gen_exp_mask_1();
119 }
120
121 /* adapted from cephes, approximation polynomial generated by sollya */
122 template <typename VecType>
vec_exp_float(VecType const & arg)123 always_inline VecType vec_exp_float(VecType const & arg)
124 {
125 typedef typename VecType::int_vec int_vec;
126
127 /* Express e**x = e**g 2**n
128 * = e**g e**( n loge(2) )
129 * = e**( g + n loge(2) )
130 */
131
132 // black magic
133 VecType x = arg;
134 VecType z = round(VecType(1.44269504088896341f) * x);
135 int_vec n = z.truncate_to_int();
136 x -= z*VecType(0.693359375f);
137 x -= z*VecType(-2.12194440e-4f);
138
139 /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
140 VecType p = VecType(VecType::gen_one()) +
141 x * (1.00000035762786865234375f +
142 x * (0.4999996721744537353515625f +
143 x * (0.16665561497211456298828125f +
144 x * (4.167006909847259521484375e-2f +
145 x * (8.420792408287525177001953125e-3f +
146 x * 1.386119984090328216552734375e-3f)))));
147
148 /* multiply by power of 2 */
149 VecType approx = ldexp_float(p, n);
150
151 /* handle min/max boundaries */
152 const VecType maxlogf(88.72283905206835f);
153 const VecType minlogf(-88.0f);
154 const VecType max_float(std::numeric_limits<float>::max());
155 const VecType zero = VecType::gen_zero();
156
157 VecType too_large = mask_gt(arg, maxlogf);
158 VecType too_small = mask_lt(arg, minlogf);
159
160 VecType ret = select(approx, max_float, too_large);
161 ret = select(ret, zero, too_small);
162
163 return ret;
164 }
165
166 /* adapted from cephes */
167 template <typename VecType>
vec_log_float(VecType x)168 always_inline VecType vec_log_float(VecType x)
169 {
170 typedef typename VecType::int_vec int_vec;
171
172 int_vec e;
173 x = frexp_float( x, e );
174
175 const VecType sqrt_05 = 0.707106781186547524f;
176 const VecType x_smaller_sqrt_05 = mask_lt(x, sqrt_05);
177 e = e + int_vec(x_smaller_sqrt_05);
178 VecType x_add = x;
179 x_add = x_add & x_smaller_sqrt_05;
180 x += x_add - VecType(VecType::gen_one());
181
182 VecType y =
183 (((((((( 7.0376836292E-2 * x
184 - 1.1514610310E-1) * x
185 + 1.1676998740E-1) * x
186 - 1.2420140846E-1) * x
187 + 1.4249322787E-1) * x
188 - 1.6668057665E-1) * x
189 + 2.0000714765E-1) * x
190 - 2.4999993993E-1) * x
191 + 3.3333331174E-1) * x * x*x;
192
193 VecType fe = e.convert_to_float();
194 y += fe * -2.12194440e-4;
195
196 y -= 0.5 * x*x; /* y - 0.5 x^2 */
197 VecType z = x + y; /* ... + x */
198
199 return z + 0.693359375 * fe;
200 }
201
202
203 /* exp function for vec_tanh_float. similar to vec_exp_tanh, but without boundary checks */
204 template <typename VecType>
vec_exp_tanh_float(VecType const & arg)205 always_inline VecType vec_exp_tanh_float(VecType const & arg)
206 {
207 typedef typename VecType::int_vec int_vec;
208
209 /* Express e**x = e**g 2**n
210 * = e**g e**( n loge(2) )
211 * = e**( g + n loge(2) )
212 */
213
214 // black magic
215 VecType x = arg;
216 VecType z = round(VecType(1.44269504088896341f) * x);
217 int_vec n = z.truncate_to_int();
218 x -= z*VecType(0.693359375f);
219 x -= z*VecType(-2.12194440e-4f);
220
221 /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
222 VecType p = 1.f +
223 x * (1.00000035762786865234375f +
224 x * (0.4999996721744537353515625f +
225 x * (0.16665561497211456298828125f +
226 x * (4.167006909847259521484375e-2f +
227 x * (8.420792408287525177001953125e-3f +
228 x * 1.386119984090328216552734375e-3f)))));
229
230 /* multiply by power of 2 */
231 VecType approx = ldexp_float(p, n);
232
233 return approx;
234 }
235
236
237 /* adapted from Julien Pommier's sse_mathfun.h, itself based on cephes */
238 template <typename VecType>
vec_sin_float(VecType const & arg)239 always_inline VecType vec_sin_float(VecType const & arg)
240 {
241 typedef typename VecType::int_vec int_vec;
242
243 const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
244
245 VecType sign = arg & VecType::gen_sign_mask();
246 VecType abs_arg = arg & VecType::gen_abs_mask();
247
248 VecType y = abs_arg * four_over_pi;
249
250 int_vec j = y.truncate_to_int();
251
252 /* cephes: j=(j+1) & (~1) */
253 j = (j + int_vec(1)) & int_vec(~1);
254 y = j.convert_to_float();
255
256 /* sign based on quadrant */
257 VecType swap_sign_bit = slli(j & int_vec(4), 29);
258 sign = sign ^ swap_sign_bit;
259
260 /* polynomial mask */
261 VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));
262
263 /* black magic */
264 static float DP1 = 0.78515625;
265 static float DP2 = 2.4187564849853515625e-4;
266 static float DP3 = 3.77489497744594108e-8;
267 VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
268
269 /* [0..pi/4] */
270 VecType z = base * base;
271 VecType p1 = (( 2.443315711809948E-005 * z
272 - 1.388731625493765E-003) * z
273 + 4.166664568298827E-002) * z * z
274 -0.5f * z + 1.0
275 ;
276
277 /* [pi/4..pi/2] */
278 VecType p2 = ((-1.9515295891E-4 * z
279 + 8.3321608736E-3) * z
280 - 1.6666654611E-1) * z * base + base;
281
282 VecType approximation = select(p1, p2, poly_mask);
283
284 return approximation ^ sign;
285 }
286
287 /* adapted from Julien Pommier's sse_mathfun.h, itself based on cephes */
288 template <typename VecType>
vec_cos_float(VecType const & arg)289 always_inline VecType vec_cos_float(VecType const & arg)
290 {
291 typedef typename VecType::int_vec int_vec;
292
293 const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
294
295 VecType abs_arg = arg & VecType::gen_abs_mask();
296
297 VecType y = abs_arg * four_over_pi;
298
299 int_vec j = y.truncate_to_int();
300
301 /* cephes: j=(j+1) & (~1) */
302 j = (j + int_vec(1)) & int_vec(~1);
303 y = j.convert_to_float();
304
305 /* sign based on quadrant */
306 int_vec jm2 = j - int_vec(2);
307 VecType sign = slli(andnot(jm2, int_vec(4)), 29);
308
309 /* polynomial mask */
310 VecType poly_mask = VecType (mask_eq(jm2 & int_vec(2), int_vec(0)));
311
312 /* black magic */
313 static float DP1 = 0.78515625;
314 static float DP2 = 2.4187564849853515625e-4;
315 static float DP3 = 3.77489497744594108e-8;
316 VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
317
318 /* [0..pi/4] */
319 VecType z = base * base;
320 VecType p1 = (( 2.443315711809948E-005 * z
321 - 1.388731625493765E-003) * z
322 + 4.166664568298827E-002) * z * z
323 -0.5f * z + 1
324 ;
325
326 /* [pi/4..pi/2] */
327 VecType p2 = ((-1.9515295891E-4 * z
328 + 8.3321608736E-3) * z
329 - 1.6666654611E-1) * z * base + base;
330
331 VecType approximation = select(p1, p2, poly_mask);
332
333 return approximation ^ sign;
334 }
335
336 /* adapted from cephes, approximation polynomial generted by sollya */
337 template <typename VecType>
vec_tan_float(VecType const & arg)338 always_inline VecType vec_tan_float(VecType const & arg)
339 {
340 typedef typename VecType::int_vec int_vec;
341 const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
342
343 VecType sign = arg & VecType::gen_sign_mask();
344 VecType abs_arg = arg & VecType::gen_abs_mask();
345
346 VecType y = abs_arg * four_over_pi;
347 int_vec j = y.truncate_to_int();
348
349 /* cephes: j=(j+1) & (~1) */
350 j = (j + int_vec(1)) & int_vec(~1);
351 y = j.convert_to_float();
352
353 /* approximation mask */
354 VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));
355
356 /* black magic */
357 static float DP1 = 0.78515625;
358 static float DP2 = 2.4187564849853515625e-4;
359 static float DP3 = 3.77489497744594108e-8;
360 VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
361
362 VecType x = base; VecType x2 = x*x;
363
364 // sollya: fpminimax(tan(x), [|3,5,7,9,11,13|], [|24...|], [-pi/4,pi/4], x);
365 VecType approx =
366 x + x * x2 * (0.3333315551280975341796875 + x2 * (0.1333882510662078857421875 + x2 * (5.3409568965435028076171875e-2 + x2 * (2.443529665470123291015625e-2 + x2 * (3.1127030961215496063232421875e-3 + x2 * 9.3892104923725128173828125e-3)))));
367
368 //VecType recip = -reciprocal(approx);
369 VecType recip = -1.0 / approx;
370
371 VecType approximation = select(recip, approx, poly_mask);
372
373 return approximation ^ sign;
374 }
375
376 /* adapted from cephes, approximation polynomial generted by sollya */
377 template <typename VecType>
vec_asin_float(VecType const & arg)378 always_inline VecType vec_asin_float(VecType const & arg)
379 {
380 VecType abs_arg = arg & VecType::gen_abs_mask();
381 VecType sign = arg & VecType::gen_sign_mask();
382 VecType one = VecType::gen_one();
383 VecType half = VecType::gen_05();
384 VecType zero = VecType::gen_zero();
385
386 // range redution: asin(x) = pi/2 - 2 asin( sqrt( (1-x)/2 ) ). for |arg| > 0.5
387 VecType arg_greater_05 = mask_gt(abs_arg, 0.5);
388 // VecType arg_reduced_sqr = (one - abs_arg) * half;
389 VecType arg_reduced = sqrt((one - abs_arg) * half);
390 VecType approx_arg = select(abs_arg, arg_reduced, arg_greater_05);
391
392
393 // VecType z = select(abs_arg*abs_arg, arg_reduced_sqr, arg_greater_05);
394
395 VecType x = approx_arg; VecType x2 = x*x;
396 // sollya: fpminimax(asin(x), [|3,5,7,9,11|], [|24...|], [0.000000000000000000001,0.5], x);
397 VecType approx_poly = x + x * x2 * (0.166667520999908447265625 + x2 * (7.4953101575374603271484375e-2 + x2 * (4.54690195620059967041015625e-2 + x2 * (2.418550290167331695556640625e-2 + x2 * 4.21570129692554473876953125e-2))));
398
399 VecType approx_poly_reduced = 1.57079637050628662109375 - approx_poly - approx_poly;
400 VecType approx = select(approx_poly, approx_poly_reduced, arg_greater_05);
401
402 approx = approx ^ sign;
403 // |arg| > 1: return 0
404 VecType ret = select(approx, zero, mask_gt(abs_arg, one));
405 return ret;
406 }
407
408 /* based on asin approximation:
409 *
410 * x < -0.5: acos(x) = pi - 2.0 * asin( sqrt((1+x)/2) );
411 * -0.5 < x < 0.5 acos(x) = pi/2 - asin(x)
412 * x > 0.5 acos(x) = 2.0 * asin( sqrt((1-x)/2) ).
413 *
414 */
415 template <typename VecType>
vec_acos_float(VecType const & arg)416 always_inline VecType vec_acos_float(VecType const & arg)
417 {
418 VecType abs_arg = arg & VecType::gen_abs_mask();
419 VecType one = VecType::gen_one();
420 VecType half = VecType::gen_05();
421 VecType zero = VecType::gen_zero();
422
423 VecType arg_greater_05 = mask_gt(abs_arg, half);
424 VecType asin_arg_greater_05 = sqrt((one - abs_arg) * half);
425
426 VecType asin_arg = select(arg, asin_arg_greater_05, arg_greater_05);
427
428 VecType asin = vec_asin_float(asin_arg);
429 VecType two_asin = asin + asin;
430
431 VecType ret_m1_m05 = 3.1415927410125732421875 - two_asin;
432 VecType ret_m05_05 = 1.57079637050628662109375 - asin;
433 VecType ret_05_1 = two_asin;
434
435 VecType ret_m05_1 = select(ret_m05_05, ret_05_1, mask_gt(arg, half));
436 VecType ret = select(ret_m1_m05, ret_m05_1, mask_gt(arg, -0.5));
437
438 // |arg| > 1: return 0
439 ret = select(ret, zero, mask_gt(abs_arg, one));
440 return ret;
441 }
442
443 /* adapted from cephes */
444 template <typename VecType>
vec_atan_float(VecType const & arg)445 always_inline VecType vec_atan_float(VecType const & arg)
446 {
447 const VecType sign_arg = arg & VecType::gen_sign_mask();
448 const VecType abs_arg = arg & VecType::gen_abs_mask();
449 const VecType one = VecType::gen_one();
450 VecType zero = VecType::gen_zero();
451
452 VecType arg_range0 = abs_arg;
453 VecType arg_range1 = (abs_arg - one) / (abs_arg + one);
454 VecType arg_range2 = -one / abs_arg;
455
456 VecType offset_range0 = zero;
457 VecType offset_range1 = 0.78539816339744830961566084581987572104929234984377;
458 VecType offset_range2 = 1.57079632679489661923132169163975144209858469968754;
459
460 VecType mask_range_01 = mask_gt(abs_arg, 0.41421356237309504880168872420969807856967187537695);
461 VecType mask_range_12 = mask_gt(abs_arg, 2.41421356237309504880168872420969807856967187537698);
462
463 VecType approx_arg = select(arg_range0,
464 select(arg_range1, arg_range2, mask_range_12),
465 mask_range_01);
466
467 VecType approx_offset = select(offset_range0,
468 select(offset_range1, offset_range2, mask_range_12),
469 mask_range_01);
470
471
472 VecType x = approx_arg;
473 VecType x2 = x*x;
474
475 VecType approx = approx_offset +
476 x + x * x2 * (-0.333329498767852783203125 + x2 * (0.19977732002735137939453125 + x2 * (-0.1387787759304046630859375 + x2 * 8.054284751415252685546875e-2)));
477
478 return approx ^ sign_arg;
479 }
480
481
482 template <typename VecType>
vec_tanh_float(VecType const & arg)483 always_inline VecType vec_tanh_float(VecType const & arg)
484 {
485 /* this order of computation (large->small->medium) seems to be the most efficient on sse*/
486
487 const VecType sign_arg = arg & VecType::gen_sign_mask();
488 const VecType abs_arg = arg ^ sign_arg;
489 const VecType one = VecType::gen_one();
490 const VecType two (2.f);
491 const VecType maxlogf_2 (22.f);
492 const VecType limit_small (0.625f);
493
494 /* medium values */
495 const VecType result_medium_abs = one - two / (vec_exp_tanh_float(abs_arg + abs_arg) + one);
496
497 /* large values */
498 const VecType abs_big = mask_gt(abs_arg, maxlogf_2);
499 const VecType result_limit_abs = one;
500
501 /* small values */
502 const VecType f1((float)-5.70498872745e-3);
503 const VecType f2((float) 2.06390887954e-2);
504 const VecType f3((float)-5.37397155531e-2);
505 const VecType f4((float) 1.33314422036e-1);
506 const VecType f5((float)-3.33332819422e-1);
507
508 const VecType arg_sqr = abs_arg * abs_arg;
509 const VecType result_small = ((((f1 * arg_sqr
510 + f2) * arg_sqr
511 + f3) * arg_sqr
512 + f4) * arg_sqr
513 + f5) * arg_sqr * arg
514 + arg;
515
516 const VecType abs_small = mask_lt(abs_arg, limit_small);
517
518 /* select from large and medium branches and set sign */
519 const VecType result_lm_abs = select(result_medium_abs, result_limit_abs, abs_big);
520 const VecType result_lm = result_lm_abs | sign_arg;
521
522 const VecType result = select(result_lm, result_small, abs_small);
523
524 return result;
525 }
526
527 template <typename VecType>
vec_signed_pow(VecType arg1,VecType arg2)528 always_inline VecType vec_signed_pow(VecType arg1, VecType arg2)
529 {
530 const VecType sign_arg1 = arg1 & VecType::gen_sign_mask();
531 const VecType abs_arg1 = arg1 ^ sign_arg1;
532
533 const VecType result = pow(abs_arg1, arg2);
534
535 return sign_arg1 | result;
536 }
537
538 /* compute pow using exp and log. seems to be faster than table-based algorithms */
539 template <typename VecType>
vec_pow(VecType arg1,VecType arg2)540 always_inline VecType vec_pow(VecType arg1, VecType arg2)
541 {
542 const VecType zero = VecType::gen_zero();
543 const VecType arg1_zero = mask_eq(arg1, zero);
544
545 const VecType result = exp(arg2 * log(arg1));
546 return select(result, zero, arg1_zero);
547 }
548
549 template <typename VecType>
vec_signed_sqrt(VecType arg)550 always_inline VecType vec_signed_sqrt(VecType arg)
551 {
552 const VecType sign_arg1 = arg & VecType::gen_sign_mask();
553 const VecType abs_arg1 = arg ^ sign_arg1;
554
555 const VecType result = sqrt(abs_arg1);
556
557 return sign_arg1 | result;
558 }
559
560 template <typename VecType>
vec_log2(VecType arg)561 always_inline VecType vec_log2(VecType arg)
562 {
563 const double rlog2 = 1.0/std::log(2.0);
564 return log(arg) * VecType((typename VecType::float_type)rlog2);
565 }
566
567 template <typename VecType>
vec_log10(VecType arg)568 always_inline VecType vec_log10(VecType arg)
569 {
570 const double rlog10 = 1.0/std::log(10.0);
571 return log(arg) * VecType((typename VecType::float_type)rlog10);
572 }
573
574 template <typename VecType>
vec_select(VecType lhs,VecType rhs,VecType bitmask)575 always_inline VecType vec_select(VecType lhs, VecType rhs, VecType bitmask)
576 {
577 const VecType result = andnot(bitmask, lhs) | (bitmask & rhs);
578 return result;
579 }
580
581
582 template <typename VecType>
vec_undenormalize(VecType arg)583 always_inline VecType vec_undenormalize(VecType arg)
584 {
585 typedef typename VecType::float_type float_type;
586 const float_type min_positive_value = std::numeric_limits<float_type>::min();
587
588 const VecType abs_arg = abs(arg);
589 const VecType abs_arg_lt_min = mask_lt(abs_arg, min_positive_value);
590 const VecType zero = VecType::gen_zero();
591
592 const VecType result = select(arg, zero, abs_arg_lt_min);
593 return result;
594 }
595
596 template <typename VecType>
vec_reciprocal_newton(VecType arg)597 always_inline VecType vec_reciprocal_newton(VecType arg)
598 {
599 const VecType one = VecType::gen_one();
600
601 const VecType approx = fast_reciprocal(arg);
602
603 // One round of Newton-Raphson refinement
604 const VecType diff = one - approx * arg;
605 const VecType result = madd(diff, approx, approx);
606
607 return result;
608 }
609
610
611 }
612 }
613
614 #undef always_inline
615
616 #endif /* NOVA_SIMD_DETAIL_VECMATH_HPP */
617