1 // genertic vectorized math functions
2 // Copyright (C) 2010 Tim Blechmann
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 // Boston, MA 02111-1307, USA.
18
19 #ifndef NOVA_SIMD_DETAIL_VECMATH_HPP
20 #define NOVA_SIMD_DETAIL_VECMATH_HPP
21
22 #include <cmath>
23
24 #if defined(__GNUC__) && defined(NDEBUG)
25 #define always_inline inline __attribute__((always_inline))
26 #else
27 #define always_inline inline
28 #endif
29 #include <limits>
30
31 /* the approximation of mathematical functions should have a maximum relative error of below 5e-7f
32 */
33
34 namespace nova
35 {
36 namespace detail
37 {
38
39 template <typename VecType>
vec_sign(VecType const & arg)40 always_inline VecType vec_sign(VecType const & arg)
41 {
42 typedef VecType vec;
43 const vec zero = vec::gen_zero();
44 const vec one = vec::gen_one();
45 const vec sign_mask = vec::gen_sign_mask();
46
47 const vec nonzero = mask_neq(arg, zero);
48 const vec sign = arg & sign_mask;
49
50 const vec abs_ret = nonzero & one;
51 return sign | abs_ret;
52 }
53
54 template <typename VecType>
vec_round_float(VecType const & arg)55 always_inline VecType vec_round_float(VecType const & arg)
56 {
57 typedef VecType vec;
58
59 const vec sign = arg & vec::gen_sign_mask();
60 const vec abs_arg = sign ^ arg;
61 const vec two_to_23_ps (0x1.0p23f);
62 const vec rounded = (abs_arg + two_to_23_ps) - two_to_23_ps;
63
64 return sign ^ rounded;
65 }
66
67 template <typename VecType>
vec_floor_float(VecType const & arg)68 always_inline VecType vec_floor_float(VecType const & arg)
69 {
70 typedef VecType vec;
71
72 const vec rounded = vec_round_float(arg);
73
74 const vec rounded_larger = mask_gt(rounded, arg);
75 const vec add = rounded_larger & vec::gen_one();
76 return rounded - add;
77 }
78
79 template <typename VecType>
vec_ceil_float(VecType const & arg)80 always_inline VecType vec_ceil_float(VecType const & arg)
81 {
82 typedef VecType vec;
83
84 const vec rounded = vec_round_float(arg);
85
86 const vec rounded_smaller = mask_lt(rounded, arg);
87 const vec add = rounded_smaller & vec::gen_one();
88 return rounded + add;
89 }
90
91 template <typename VecFloat>
ldexp_float(VecFloat const & x,typename VecFloat::int_vec const & n)92 always_inline VecFloat ldexp_float(VecFloat const & x, typename VecFloat::int_vec const & n)
93 {
94 typedef typename VecFloat::int_vec int_vec;
95
96 const VecFloat exponent_mask = VecFloat::gen_exp_mask();
97 const VecFloat exponent = exponent_mask & x;
98 const VecFloat x_wo_x = andnot(exponent_mask, x); // clear exponent
99
100 int_vec new_exp = slli(n, 16+7) + int_vec(exponent); // new exponent
101 VecFloat new_exp_float(new_exp);
102 VecFloat ret = x_wo_x | new_exp_float;
103 return ret;
104 }
105
106 template <typename VecFloat>
frexp_float(VecFloat const & x,typename VecFloat::int_vec & exp)107 always_inline VecFloat frexp_float(VecFloat const & x, typename VecFloat::int_vec & exp)
108 {
109 typedef typename VecFloat::int_vec int_vec;
110
111 const VecFloat exponent_mask = VecFloat::gen_exp_mask();
112 const VecFloat exponent = exponent_mask & x;
113 const VecFloat x_wo_x = andnot(exponent_mask, x); // clear exponent
114
115 const int_vec exp_int(exponent);
116
117 exp = srli(exp_int, 16+7) - int_vec(126);
118 return x_wo_x | VecFloat::gen_exp_mask_1();
119 }
120
121 /* adapted from cephes, approximation polynomial generated by sollya */
122 template <typename VecType>
vec_exp_float(VecType const & arg)123 always_inline VecType vec_exp_float(VecType const & arg)
124 {
125 typedef typename VecType::int_vec int_vec;
126
127 /* Express e**x = e**g 2**n
128 * = e**g e**( n loge(2) )
129 * = e**( g + n loge(2) )
130 */
131
132 // black magic
133 VecType x = arg;
134 VecType z = round(VecType(1.44269504088896341f) * x);
135 int_vec n = z.truncate_to_int();
136 x -= z*VecType(0.693359375f);
137 x -= z*VecType(-2.12194440e-4f);
138
139 /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
140 VecType p = VecType(VecType::gen_one()) +
141 x * (1.00000035762786865234375f +
142 x * (0.4999996721744537353515625f +
143 x * (0.16665561497211456298828125f +
144 x * (4.167006909847259521484375e-2f +
145 x * (8.420792408287525177001953125e-3f +
146 x * 1.386119984090328216552734375e-3f)))));
147
148 /* multiply by power of 2 */
149 VecType approx = ldexp_float(p, n);
150
151 /* handle min/max boundaries */
152 const VecType maxlogf(88.72283905206835f);
153 // const VecType minlogf(-103.278929903431851103f);
154 const VecType minlogf = -maxlogf;
155 const VecType max_float(std::numeric_limits<float>::max());
156 const VecType zero = VecType::gen_zero();
157
158 VecType too_large = mask_gt(arg, maxlogf);
159 VecType too_small = mask_lt(arg, minlogf);
160
161 VecType ret = select(approx, max_float, too_large);
162 ret = select(ret, zero, too_small);
163
164 return ret;
165 }
166
167 /* adapted from cephes */
168 template <typename VecType>
vec_log_float(VecType x)169 always_inline VecType vec_log_float(VecType x)
170 {
171 typedef typename VecType::int_vec int_vec;
172
173 int_vec e;
174 x = frexp_float( x, e );
175
176 const VecType sqrt_05 = 0.707106781186547524f;
177 const VecType x_smaller_sqrt_05 = mask_lt(x, sqrt_05);
178 e = e + int_vec(x_smaller_sqrt_05);
179 VecType x_add = x;
180 x_add = x_add & x_smaller_sqrt_05;
181 x += x_add - VecType(VecType::gen_one());
182
183 VecType y =
184 (((((((( 7.0376836292E-2 * x
185 - 1.1514610310E-1) * x
186 + 1.1676998740E-1) * x
187 - 1.2420140846E-1) * x
188 + 1.4249322787E-1) * x
189 - 1.6668057665E-1) * x
190 + 2.0000714765E-1) * x
191 - 2.4999993993E-1) * x
192 + 3.3333331174E-1) * x * x*x;
193
194 VecType fe = e.convert_to_float();
195 y += fe * -2.12194440e-4;
196
197 y -= 0.5 * x*x; /* y - 0.5 x^2 */
198 VecType z = x + y; /* ... + x */
199
200 return z + 0.693359375 * fe;
201 }
202
203
204 /* exp function for vec_tanh_float. similar to vec_exp_tanh, but without boundary checks */
205 template <typename VecType>
vec_exp_tanh_float(VecType const & arg)206 always_inline VecType vec_exp_tanh_float(VecType const & arg)
207 {
208 typedef typename VecType::int_vec int_vec;
209
210 /* Express e**x = e**g 2**n
211 * = e**g e**( n loge(2) )
212 * = e**( g + n loge(2) )
213 */
214
215 // black magic
216 VecType x = arg;
217 VecType z = round(VecType(1.44269504088896341f) * x);
218 int_vec n = z.truncate_to_int();
219 x -= z*VecType(0.693359375f);
220 x -= z*VecType(-2.12194440e-4f);
221
222 /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
223 VecType p = 1.f +
224 x * (1.00000035762786865234375f +
225 x * (0.4999996721744537353515625f +
226 x * (0.16665561497211456298828125f +
227 x * (4.167006909847259521484375e-2f +
228 x * (8.420792408287525177001953125e-3f +
229 x * 1.386119984090328216552734375e-3f)))));
230
231 /* multiply by power of 2 */
232 VecType approx = ldexp_float(p, n);
233
234 return approx;
235 }
236
237
238 /* adapted from Julien Pommier's sse_mathfun.h, itself based on cephes */
239 template <typename VecType>
vec_sin_float(VecType const & arg)240 always_inline VecType vec_sin_float(VecType const & arg)
241 {
242 typedef typename VecType::int_vec int_vec;
243
244 const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
245
246 VecType sign = arg & VecType::gen_sign_mask();
247 VecType abs_arg = arg & VecType::gen_abs_mask();
248
249 VecType y = abs_arg * four_over_pi;
250
251 int_vec j = y.truncate_to_int();
252
253 /* cephes: j=(j+1) & (~1) */
254 j = (j + int_vec(1)) & int_vec(~1);
255 y = j.convert_to_float();
256
257 /* sign based on quadrant */
258 VecType swap_sign_bit = slli(j & int_vec(4), 29);
259 sign = sign ^ swap_sign_bit;
260
261 /* polynomial mask */
262 VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));
263
264 /* black magic */
265 static float DP1 = 0.78515625;
266 static float DP2 = 2.4187564849853515625e-4;
267 static float DP3 = 3.77489497744594108e-8;
268 VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
269
270 /* [0..pi/4] */
271 VecType z = base * base;
272 VecType p1 = (( 2.443315711809948E-005 * z
273 - 1.388731625493765E-003) * z
274 + 4.166664568298827E-002) * z * z
275 -0.5f * z + 1.0
276 ;
277
278 /* [pi/4..pi/2] */
279 VecType p2 = ((-1.9515295891E-4 * z
280 + 8.3321608736E-3) * z
281 - 1.6666654611E-1) * z * base + base;
282
283 VecType approximation = select(p1, p2, poly_mask);
284
285 return approximation ^ sign;
286 }
287
288 /* adapted from Julien Pommier's sse_mathfun.h, itself based on cephes */
289 template <typename VecType>
vec_cos_float(VecType const & arg)290 always_inline VecType vec_cos_float(VecType const & arg)
291 {
292 typedef typename VecType::int_vec int_vec;
293
294 const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
295
296 VecType abs_arg = arg & VecType::gen_abs_mask();
297
298 VecType y = abs_arg * four_over_pi;
299
300 int_vec j = y.truncate_to_int();
301
302 /* cephes: j=(j+1) & (~1) */
303 j = (j + int_vec(1)) & int_vec(~1);
304 y = j.convert_to_float();
305
306 /* sign based on quadrant */
307 int_vec jm2 = j - int_vec(2);
308 VecType sign = slli(andnot(jm2, int_vec(4)), 29);
309
310 /* polynomial mask */
311 VecType poly_mask = VecType (mask_eq(jm2 & int_vec(2), int_vec(0)));
312
313 /* black magic */
314 static float DP1 = 0.78515625;
315 static float DP2 = 2.4187564849853515625e-4;
316 static float DP3 = 3.77489497744594108e-8;
317 VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
318
319 /* [0..pi/4] */
320 VecType z = base * base;
321 VecType p1 = (( 2.443315711809948E-005 * z
322 - 1.388731625493765E-003) * z
323 + 4.166664568298827E-002) * z * z
324 -0.5f * z + 1
325 ;
326
327 /* [pi/4..pi/2] */
328 VecType p2 = ((-1.9515295891E-4 * z
329 + 8.3321608736E-3) * z
330 - 1.6666654611E-1) * z * base + base;
331
332 VecType approximation = select(p1, p2, poly_mask);
333
334 return approximation ^ sign;
335 }
336
337 /* adapted from cephes, approximation polynomial generted by sollya */
338 template <typename VecType>
vec_tan_float(VecType const & arg)339 always_inline VecType vec_tan_float(VecType const & arg)
340 {
341 typedef typename VecType::int_vec int_vec;
342 const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;
343
344 VecType sign = arg & VecType::gen_sign_mask();
345 VecType abs_arg = arg & VecType::gen_abs_mask();
346
347 VecType y = abs_arg * four_over_pi;
348 int_vec j = y.truncate_to_int();
349
350 /* cephes: j=(j+1) & (~1) */
351 j = (j + int_vec(1)) & int_vec(~1);
352 y = j.convert_to_float();
353
354 /* approximation mask */
355 VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));
356
357 /* black magic */
358 static float DP1 = 0.78515625;
359 static float DP2 = 2.4187564849853515625e-4;
360 static float DP3 = 3.77489497744594108e-8;
361 VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;
362
363 VecType x = base; VecType x2 = x*x;
364
365 // sollya: fpminimax(tan(x), [|3,5,7,9,11,13|], [|24...|], [-pi/4,pi/4], x);
366 VecType approx =
367 x + x * x2 * (0.3333315551280975341796875 + x2 * (0.1333882510662078857421875 + x2 * (5.3409568965435028076171875e-2 + x2 * (2.443529665470123291015625e-2 + x2 * (3.1127030961215496063232421875e-3 + x2 * 9.3892104923725128173828125e-3)))));
368
369 //VecType recip = -reciprocal(approx);
370 VecType recip = -1.0 / approx;
371
372 VecType approximation = select(recip, approx, poly_mask);
373
374 return approximation ^ sign;
375 }
376
377 /* adapted from cephes, approximation polynomial generted by sollya */
378 template <typename VecType>
vec_asin_float(VecType const & arg)379 always_inline VecType vec_asin_float(VecType const & arg)
380 {
381 VecType abs_arg = arg & VecType::gen_abs_mask();
382 VecType sign = arg & VecType::gen_sign_mask();
383 VecType one = VecType::gen_one();
384 VecType half = VecType::gen_05();
385 VecType zero = VecType::gen_zero();
386
387 // range redution: asin(x) = pi/2 - 2 asin( sqrt( (1-x)/2 ) ). for |arg| > 0.5
388 VecType arg_greater_05 = mask_gt(abs_arg, 0.5);
389 VecType arg_reduced_sqr = (one - abs_arg) * half;
390 VecType arg_reduced = sqrt((one - abs_arg) * half);
391 VecType approx_arg = select(abs_arg, arg_reduced, arg_greater_05);
392
393
394 VecType z = select(abs_arg*abs_arg, arg_reduced_sqr, arg_greater_05);
395
396 VecType x = approx_arg; VecType x2 = x*x;
397 // sollya: fpminimax(asin(x), [|3,5,7,9,11|], [|24...|], [0.000000000000000000001,0.5], x);
398 VecType approx_poly = x + x * x2 * (0.166667520999908447265625 + x2 * (7.4953101575374603271484375e-2 + x2 * (4.54690195620059967041015625e-2 + x2 * (2.418550290167331695556640625e-2 + x2 * 4.21570129692554473876953125e-2))));
399
400 VecType approx_poly_reduced = 1.57079637050628662109375 - approx_poly - approx_poly;
401 VecType approx = select(approx_poly, approx_poly_reduced, arg_greater_05);
402
403 approx = approx ^ sign;
404 // |arg| > 1: return 0
405 VecType ret = select(approx, zero, mask_gt(abs_arg, one));
406 return ret;
407 }
408
409 /* based on asin approximation:
410 *
411 * x < -0.5: acos(x) = pi - 2.0 * asin( sqrt((1+x)/2) );
412 * -0.5 < x < 0.5 acos(x) = pi/2 - asin(x)
413 * x > 0.5 acos(x) = 2.0 * asin( sqrt((1-x)/2) ).
414 *
415 */
416 template <typename VecType>
vec_acos_float(VecType const & arg)417 always_inline VecType vec_acos_float(VecType const & arg)
418 {
419 VecType abs_arg = arg & VecType::gen_abs_mask();
420 VecType one = VecType::gen_one();
421 VecType half = VecType::gen_05();
422 VecType zero = VecType::gen_zero();
423
424 VecType arg_greater_05 = mask_gt(abs_arg, half);
425 VecType asin_arg_greater_05 = sqrt((one - abs_arg) * half);
426
427 VecType asin_arg = select(arg, asin_arg_greater_05, arg_greater_05);
428
429 VecType asin = vec_asin_float(asin_arg);
430 VecType two_asin = asin + asin;
431
432 VecType ret_m1_m05 = 3.1415927410125732421875 - two_asin;
433 VecType ret_m05_05 = 1.57079637050628662109375 - asin;
434 VecType ret_05_1 = two_asin;
435
436 VecType ret_m05_1 = select(ret_m05_05, ret_05_1, mask_gt(arg, half));
437 VecType ret = select(ret_m1_m05, ret_m05_1, mask_gt(arg, -0.5));
438
439 // |arg| > 1: return 0
440 ret = select(ret, zero, mask_gt(abs_arg, one));
441 return ret;
442 }
443
444 /* adapted from cephes */
445 template <typename VecType>
vec_atan_float(VecType const & arg)446 always_inline VecType vec_atan_float(VecType const & arg)
447 {
448 const VecType sign_arg = arg & VecType::gen_sign_mask();
449 const VecType abs_arg = arg & VecType::gen_abs_mask();
450 const VecType one = VecType::gen_one();
451 VecType zero = VecType::gen_zero();
452
453 VecType arg_range0 = abs_arg;
454 VecType arg_range1 = (abs_arg - one) / (abs_arg + one);
455 VecType arg_range2 = -one / abs_arg;
456
457 VecType offset_range0 = zero;
458 VecType offset_range1 = 0.78539816339744830961566084581987572104929234984377;
459 VecType offset_range2 = 1.57079632679489661923132169163975144209858469968754;
460
461 VecType mask_range_01 = mask_gt(abs_arg, 0.41421356237309504880168872420969807856967187537695);
462 VecType mask_range_12 = mask_gt(abs_arg, 2.41421356237309504880168872420969807856967187537698);
463
464 VecType approx_arg = select(arg_range0,
465 select(arg_range1, arg_range2, mask_range_12),
466 mask_range_01);
467
468 VecType approx_offset = select(offset_range0,
469 select(offset_range1, offset_range2, mask_range_12),
470 mask_range_01);
471
472
473 VecType x = approx_arg;
474 VecType x2 = x*x;
475
476 VecType approx = approx_offset +
477 x + x * x2 * (-0.333329498767852783203125 + x2 * (0.19977732002735137939453125 + x2 * (-0.1387787759304046630859375 + x2 * 8.054284751415252685546875e-2)));
478
479 return approx ^ sign_arg;
480 }
481
482
483 template <typename VecType>
vec_tanh_float(VecType const & arg)484 always_inline VecType vec_tanh_float(VecType const & arg)
485 {
486 /* this order of computation (large->small->medium) seems to be the most efficient on sse*/
487
488 const VecType sign_arg = arg & VecType::gen_sign_mask();
489 const VecType abs_arg = arg ^ sign_arg;
490 const VecType one = VecType::gen_one();
491 const VecType two (2.f);
492 const VecType maxlogf_2 (22.f);
493 const VecType limit_small (0.625f);
494
495 /* medium values */
496 const VecType result_medium_abs = one - two / (vec_exp_tanh_float(abs_arg + abs_arg) + one);
497
498 /* large values */
499 const VecType abs_big = mask_gt(abs_arg, maxlogf_2);
500 const VecType result_limit_abs = one;
501
502 /* small values */
503 const VecType f1((float)-5.70498872745e-3);
504 const VecType f2((float) 2.06390887954e-2);
505 const VecType f3((float)-5.37397155531e-2);
506 const VecType f4((float) 1.33314422036e-1);
507 const VecType f5((float)-3.33332819422e-1);
508
509 const VecType arg_sqr = abs_arg * abs_arg;
510 const VecType result_small = ((((f1 * arg_sqr
511 + f2) * arg_sqr
512 + f3) * arg_sqr
513 + f4) * arg_sqr
514 + f5) * arg_sqr * arg
515 + arg;
516
517 const VecType abs_small = mask_lt(abs_arg, limit_small);
518
519 /* select from large and medium branches and set sign */
520 const VecType result_lm_abs = select(result_medium_abs, result_limit_abs, abs_big);
521 const VecType result_lm = result_lm_abs | sign_arg;
522
523 const VecType result = select(result_lm, result_small, abs_small);
524
525 return result;
526 }
527
528 template <typename VecType>
vec_signed_pow(VecType arg1,VecType arg2)529 always_inline VecType vec_signed_pow(VecType arg1, VecType arg2)
530 {
531 const VecType sign_arg1 = arg1 & VecType::gen_sign_mask();
532 const VecType abs_arg1 = arg1 ^ sign_arg1;
533
534 const VecType result = pow(abs_arg1, arg2);
535
536 return sign_arg1 | result;
537 }
538
539 /* compute pow using exp and log. seems to be faster than table-based algorithms */
540 template <typename VecType>
vec_pow(VecType arg1,VecType arg2)541 always_inline VecType vec_pow(VecType arg1, VecType arg2)
542 {
543 const VecType zero = VecType::gen_zero();
544 const VecType arg1_zero = mask_eq(arg1, zero);
545
546 const VecType result = exp(arg2 * log(arg1));
547 return select(result, zero, arg1_zero);
548 }
549
550 template <typename VecType>
vec_signed_sqrt(VecType arg)551 always_inline VecType vec_signed_sqrt(VecType arg)
552 {
553 const VecType sign_arg1 = arg & VecType::gen_sign_mask();
554 const VecType abs_arg1 = arg ^ sign_arg1;
555
556 const VecType result = sqrt(abs_arg1);
557
558 return sign_arg1 | result;
559 }
560
561 template <typename VecType>
vec_log2(VecType arg)562 always_inline VecType vec_log2(VecType arg)
563 {
564 const double rlog2 = 1.0/std::log(2.0);
565 return log(arg) * VecType((typename VecType::float_type)rlog2);
566 }
567
568 template <typename VecType>
vec_log10(VecType arg)569 always_inline VecType vec_log10(VecType arg)
570 {
571 const double rlog10 = 1.0/std::log(10.0);
572 return log(arg) * VecType((typename VecType::float_type)rlog10);
573 }
574
575 template <typename VecType>
vec_select(VecType lhs,VecType rhs,VecType bitmask)576 always_inline VecType vec_select(VecType lhs, VecType rhs, VecType bitmask)
577 {
578 const VecType result = andnot(bitmask, lhs) | (bitmask & rhs);
579 return result;
580 }
581
582
583 template <typename VecType>
vec_undenormalize(VecType arg)584 always_inline VecType vec_undenormalize(VecType arg)
585 {
586 typedef typename VecType::float_type float_type;
587 const float_type min_positive_value = std::numeric_limits<float_type>::min();
588
589 const VecType abs_arg = abs(arg);
590 const VecType abs_arg_lt_min = mask_lt(abs_arg, min_positive_value);
591 const VecType zero = VecType::gen_zero();
592
593 const VecType result = select(arg, zero, abs_arg_lt_min);
594 return result;
595 }
596
597 template <typename VecType>
vec_reciprocal_newton(VecType arg)598 always_inline VecType vec_reciprocal_newton(VecType arg)
599 {
600 const VecType one = VecType::gen_one();
601
602 const VecType approx = fast_reciprocal(arg);
603
604 // One round of Newton-Raphson refinement
605 const VecType diff = one - approx * arg;
606 const VecType result = madd(diff, approx, approx);
607
608 return result;
609 }
610
611
612 }
613 }
614
615 #undef always_inline
616
617 #endif /* NOVA_SIMD_DETAIL_VECMATH_HPP */
618