1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version 2
5 * of the License, or (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software Foundation,
14 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
15 *
16 * The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
17 * All rights reserved.
18 *
19 * The Original Code is: some of this file.
20 *
21 * */
22
23 /** \file
24 * \ingroup bli
25 */
26
27 #ifndef __MATH_BASE_INLINE_C__
28 #define __MATH_BASE_INLINE_C__
29
30 #include <float.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34
35 #ifdef __SSE2__
36 # include <emmintrin.h>
37 #endif
38
39 #include "BLI_math_base.h"
40
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44
45 /* copied from BLI_utildefines.h */
46 #ifdef __GNUC__
47 # define UNLIKELY(x) __builtin_expect(!!(x), 0)
48 #else
49 # define UNLIKELY(x) (x)
50 #endif
51
52 /* powf is really slow for raising to integer powers. */
pow2f(float x)53 MINLINE float pow2f(float x)
54 {
55 return x * x;
56 }
pow3f(float x)57 MINLINE float pow3f(float x)
58 {
59 return pow2f(x) * x;
60 }
pow4f(float x)61 MINLINE float pow4f(float x)
62 {
63 return pow2f(pow2f(x));
64 }
pow5f(float x)65 MINLINE float pow5f(float x)
66 {
67 return pow4f(x) * x;
68 }
pow7f(float x)69 MINLINE float pow7f(float x)
70 {
71 return pow2f(pow3f(x)) * x;
72 }
73
sqrt3f(float f)74 MINLINE float sqrt3f(float f)
75 {
76 if (UNLIKELY(f == 0.0f)) {
77 return 0.0f;
78 }
79 else if (UNLIKELY(f < 0.0f)) {
80 return -(float)(exp(log(-f) / 3.0));
81 }
82 else {
83 return (float)(exp(log(f) / 3.0));
84 }
85 }
86
sqrt3d(double d)87 MINLINE double sqrt3d(double d)
88 {
89 if (UNLIKELY(d == 0.0)) {
90 return 0.0;
91 }
92 else if (UNLIKELY(d < 0.0)) {
93 return -exp(log(-d) / 3.0);
94 }
95 else {
96 return exp(log(d) / 3.0);
97 }
98 }
99
sqrtf_signed(float f)100 MINLINE float sqrtf_signed(float f)
101 {
102 return (f >= 0.0f) ? sqrtf(f) : -sqrtf(-f);
103 }
104
saacos(float fac)105 MINLINE float saacos(float fac)
106 {
107 if (UNLIKELY(fac <= -1.0f)) {
108 return (float)M_PI;
109 }
110 else if (UNLIKELY(fac >= 1.0f)) {
111 return 0.0f;
112 }
113 else {
114 return acosf(fac);
115 }
116 }
117
saasin(float fac)118 MINLINE float saasin(float fac)
119 {
120 if (UNLIKELY(fac <= -1.0f)) {
121 return (float)-M_PI / 2.0f;
122 }
123 else if (UNLIKELY(fac >= 1.0f)) {
124 return (float)M_PI / 2.0f;
125 }
126 else {
127 return asinf(fac);
128 }
129 }
130
sasqrt(float fac)131 MINLINE float sasqrt(float fac)
132 {
133 if (UNLIKELY(fac <= 0.0f)) {
134 return 0.0f;
135 }
136 else {
137 return sqrtf(fac);
138 }
139 }
140
saacosf(float fac)141 MINLINE float saacosf(float fac)
142 {
143 if (UNLIKELY(fac <= -1.0f)) {
144 return (float)M_PI;
145 }
146 else if (UNLIKELY(fac >= 1.0f)) {
147 return 0.0f;
148 }
149 else {
150 return acosf(fac);
151 }
152 }
153
saasinf(float fac)154 MINLINE float saasinf(float fac)
155 {
156 if (UNLIKELY(fac <= -1.0f)) {
157 return (float)-M_PI / 2.0f;
158 }
159 else if (UNLIKELY(fac >= 1.0f)) {
160 return (float)M_PI / 2.0f;
161 }
162 else {
163 return asinf(fac);
164 }
165 }
166
sasqrtf(float fac)167 MINLINE float sasqrtf(float fac)
168 {
169 if (UNLIKELY(fac <= 0.0f)) {
170 return 0.0f;
171 }
172 else {
173 return sqrtf(fac);
174 }
175 }
176
interpf(float target,float origin,float fac)177 MINLINE float interpf(float target, float origin, float fac)
178 {
179 return (fac * target) + (1.0f - fac) * origin;
180 }
181
interpd(double target,double origin,double fac)182 MINLINE double interpd(double target, double origin, double fac)
183 {
184 return (fac * target) + (1.0f - fac) * origin;
185 }
186
187 /* used for zoom values*/
power_of_2(float val)188 MINLINE float power_of_2(float val)
189 {
190 return (float)pow(2.0, ceil(log((double)val) / M_LN2));
191 }
192
is_power_of_2_i(int n)193 MINLINE int is_power_of_2_i(int n)
194 {
195 return (n & (n - 1)) == 0;
196 }
197
power_of_2_max_i(int n)198 MINLINE int power_of_2_max_i(int n)
199 {
200 if (is_power_of_2_i(n)) {
201 return n;
202 }
203
204 do {
205 n = n & (n - 1);
206 } while (!is_power_of_2_i(n));
207
208 return n * 2;
209 }
210
power_of_2_min_i(int n)211 MINLINE int power_of_2_min_i(int n)
212 {
213 while (!is_power_of_2_i(n)) {
214 n = n & (n - 1);
215 }
216
217 return n;
218 }
219
power_of_2_max_u(unsigned int x)220 MINLINE unsigned int power_of_2_max_u(unsigned int x)
221 {
222 x -= 1;
223 x |= (x >> 1);
224 x |= (x >> 2);
225 x |= (x >> 4);
226 x |= (x >> 8);
227 x |= (x >> 16);
228 return x + 1;
229 }
230
power_of_2_min_u(unsigned int x)231 MINLINE unsigned int power_of_2_min_u(unsigned int x)
232 {
233 x |= (x >> 1);
234 x |= (x >> 2);
235 x |= (x >> 4);
236 x |= (x >> 8);
237 x |= (x >> 16);
238 return x - (x >> 1);
239 }
240
log2_floor_u(unsigned int x)241 MINLINE unsigned int log2_floor_u(unsigned int x)
242 {
243 return x <= 1 ? 0 : 1 + log2_floor_u(x >> 1);
244 }
245
log2_ceil_u(unsigned int x)246 MINLINE unsigned int log2_ceil_u(unsigned int x)
247 {
248 if (is_power_of_2_i((int)x)) {
249 return log2_floor_u(x);
250 }
251 else {
252 return log2_floor_u(x) + 1;
253 }
254 }
255
256 /* rounding and clamping */
257
258 #define _round_clamp_fl_impl(arg, ty, min, max) \
259 { \
260 float r = floorf(arg + 0.5f); \
261 if (UNLIKELY(r <= (float)min)) { \
262 return (ty)min; \
263 } \
264 else if (UNLIKELY(r >= (float)max)) { \
265 return (ty)max; \
266 } \
267 else { \
268 return (ty)r; \
269 } \
270 }
271
272 #define _round_clamp_db_impl(arg, ty, min, max) \
273 { \
274 double r = floor(arg + 0.5); \
275 if (UNLIKELY(r <= (double)min)) { \
276 return (ty)min; \
277 } \
278 else if (UNLIKELY(r >= (double)max)) { \
279 return (ty)max; \
280 } \
281 else { \
282 return (ty)r; \
283 } \
284 }
285
286 #define _round_fl_impl(arg, ty) \
287 { \
288 return (ty)floorf(arg + 0.5f); \
289 }
290 #define _round_db_impl(arg, ty) \
291 { \
292 return (ty)floor(arg + 0.5); \
293 }
294
round_fl_to_char(float a)295 MINLINE signed char round_fl_to_char(float a){_round_fl_impl(a, signed char)} MINLINE
round_fl_to_uchar(float a)296 unsigned char round_fl_to_uchar(float a){_round_fl_impl(a, unsigned char)} MINLINE
round_fl_to_short(float a)297 short round_fl_to_short(float a){_round_fl_impl(a, short)} MINLINE
round_fl_to_ushort(float a)298 unsigned short round_fl_to_ushort(float a){_round_fl_impl(a, unsigned short)} MINLINE
round_fl_to_int(float a)299 int round_fl_to_int(float a){_round_fl_impl(a, int)} MINLINE
round_fl_to_uint(float a)300 unsigned int round_fl_to_uint(float a){_round_fl_impl(a, unsigned int)}
301
round_db_to_char(double a)302 MINLINE signed char round_db_to_char(double a){_round_db_impl(a, signed char)} MINLINE
round_db_to_uchar(double a)303 unsigned char round_db_to_uchar(double a){_round_db_impl(a, unsigned char)} MINLINE
round_db_to_short(double a)304 short round_db_to_short(double a){_round_db_impl(a, short)} MINLINE
round_db_to_ushort(double a)305 unsigned short round_db_to_ushort(double a){_round_db_impl(a, unsigned short)} MINLINE
round_db_to_int(double a)306 int round_db_to_int(double a){_round_db_impl(a, int)} MINLINE
round_db_to_uint(double a)307 unsigned int round_db_to_uint(double a)
308 {
309 _round_db_impl(a, unsigned int)
310 }
311
312 #undef _round_fl_impl
313 #undef _round_db_impl
314
round_fl_to_char_clamp(float a)315 MINLINE signed char round_fl_to_char_clamp(float a){
316 _round_clamp_fl_impl(a, signed char, SCHAR_MIN, SCHAR_MAX)} MINLINE
round_fl_to_uchar_clamp(float a)317 unsigned char round_fl_to_uchar_clamp(float a){
318 _round_clamp_fl_impl(a, unsigned char, 0, UCHAR_MAX)} MINLINE
round_fl_to_short_clamp(float a)319 short round_fl_to_short_clamp(float a){
320 _round_clamp_fl_impl(a, short, SHRT_MIN, SHRT_MAX)} MINLINE
round_fl_to_ushort_clamp(float a)321 unsigned short round_fl_to_ushort_clamp(float a){
322 _round_clamp_fl_impl(a, unsigned short, 0, USHRT_MAX)} MINLINE
round_fl_to_int_clamp(float a)323 int round_fl_to_int_clamp(float a){_round_clamp_fl_impl(a, int, INT_MIN, INT_MAX)} MINLINE
round_fl_to_uint_clamp(float a)324 unsigned int round_fl_to_uint_clamp(float a){
325 _round_clamp_fl_impl(a, unsigned int, 0, UINT_MAX)}
326
round_db_to_char_clamp(double a)327 MINLINE signed char round_db_to_char_clamp(double a){
328 _round_clamp_db_impl(a, signed char, SCHAR_MIN, SCHAR_MAX)} MINLINE
round_db_to_uchar_clamp(double a)329 unsigned char round_db_to_uchar_clamp(double a){
330 _round_clamp_db_impl(a, unsigned char, 0, UCHAR_MAX)} MINLINE
round_db_to_short_clamp(double a)331 short round_db_to_short_clamp(double a){
332 _round_clamp_db_impl(a, short, SHRT_MIN, SHRT_MAX)} MINLINE
round_db_to_ushort_clamp(double a)333 unsigned short round_db_to_ushort_clamp(double a){
334 _round_clamp_db_impl(a, unsigned short, 0, USHRT_MAX)} MINLINE
round_db_to_int_clamp(double a)335 int round_db_to_int_clamp(double a){_round_clamp_db_impl(a, int, INT_MIN, INT_MAX)} MINLINE
round_db_to_uint_clamp(double a)336 unsigned int round_db_to_uint_clamp(double a)
337 {
338 _round_clamp_db_impl(a, unsigned int, 0, UINT_MAX)
339 }
340
341 #undef _round_clamp_fl_impl
342 #undef _round_clamp_db_impl
343
344 /* integer division that rounds 0.5 up, particularly useful for color blending
345 * with integers, to avoid gradual darkening when rounding down */
divide_round_i(int a,int b)346 MINLINE int divide_round_i(int a, int b)
347 {
348 return (2 * a + b) / (2 * b);
349 }
350
351 /**
352 * Integer division that floors negative result.
353 * \note This works like Python's int division.
354 */
divide_floor_i(int a,int b)355 MINLINE int divide_floor_i(int a, int b)
356 {
357 int d = a / b;
358 int r = a % b; /* Optimizes into a single division. */
359 return r ? d - ((a < 0) ^ (b < 0)) : d;
360 }
361
362 /**
363 * Integer division that returns the ceiling, instead of flooring like normal C division.
364 */
divide_ceil_u(uint a,uint b)365 MINLINE uint divide_ceil_u(uint a, uint b)
366 {
367 return (a + b - 1) / b;
368 }
369
370 /**
371 * modulo that handles negative numbers, works the same as Python's.
372 */
mod_i(int i,int n)373 MINLINE int mod_i(int i, int n)
374 {
375 return (i % n + n) % n;
376 }
377
fractf(float a)378 MINLINE float fractf(float a)
379 {
380 return a - floorf(a);
381 }
382
383 /* Adapted from godotengine math_funcs.h. */
wrapf(float value,float max,float min)384 MINLINE float wrapf(float value, float max, float min)
385 {
386 float range = max - min;
387 return (range != 0.0f) ? value - (range * floorf((value - min) / range)) : min;
388 }
389
390 // Square.
391
square_s(short a)392 MINLINE int square_s(short a)
393 {
394 return a * a;
395 }
396
square_i(int a)397 MINLINE int square_i(int a)
398 {
399 return a * a;
400 }
401
square_uint(unsigned int a)402 MINLINE unsigned int square_uint(unsigned int a)
403 {
404 return a * a;
405 }
406
square_uchar(unsigned char a)407 MINLINE int square_uchar(unsigned char a)
408 {
409 return a * a;
410 }
411
square_f(float a)412 MINLINE float square_f(float a)
413 {
414 return a * a;
415 }
416
square_d(double a)417 MINLINE double square_d(double a)
418 {
419 return a * a;
420 }
421
422 // Cube.
423
cube_s(short a)424 MINLINE int cube_s(short a)
425 {
426 return a * a * a;
427 }
428
cube_i(int a)429 MINLINE int cube_i(int a)
430 {
431 return a * a * a;
432 }
433
cube_uint(unsigned int a)434 MINLINE unsigned int cube_uint(unsigned int a)
435 {
436 return a * a * a;
437 }
438
cube_uchar(unsigned char a)439 MINLINE int cube_uchar(unsigned char a)
440 {
441 return a * a * a;
442 }
443
cube_f(float a)444 MINLINE float cube_f(float a)
445 {
446 return a * a * a;
447 }
448
cube_d(double a)449 MINLINE double cube_d(double a)
450 {
451 return a * a * a;
452 }
453
454 // Min/max
455
min_ff(float a,float b)456 MINLINE float min_ff(float a, float b)
457 {
458 return (a < b) ? a : b;
459 }
max_ff(float a,float b)460 MINLINE float max_ff(float a, float b)
461 {
462 return (a > b) ? a : b;
463 }
464 /* See: https://www.iquilezles.org/www/articles/smin/smin.htm. */
smoothminf(float a,float b,float c)465 MINLINE float smoothminf(float a, float b, float c)
466 {
467 if (c != 0.0f) {
468 float h = max_ff(c - fabsf(a - b), 0.0f) / c;
469 return min_ff(a, b) - h * h * h * c * (1.0f / 6.0f);
470 }
471 else {
472 return min_ff(a, b);
473 }
474 }
475
min_dd(double a,double b)476 MINLINE double min_dd(double a, double b)
477 {
478 return (a < b) ? a : b;
479 }
max_dd(double a,double b)480 MINLINE double max_dd(double a, double b)
481 {
482 return (a > b) ? a : b;
483 }
484
min_ii(int a,int b)485 MINLINE int min_ii(int a, int b)
486 {
487 return (a < b) ? a : b;
488 }
max_ii(int a,int b)489 MINLINE int max_ii(int a, int b)
490 {
491 return (b < a) ? a : b;
492 }
493
min_fff(float a,float b,float c)494 MINLINE float min_fff(float a, float b, float c)
495 {
496 return min_ff(min_ff(a, b), c);
497 }
max_fff(float a,float b,float c)498 MINLINE float max_fff(float a, float b, float c)
499 {
500 return max_ff(max_ff(a, b), c);
501 }
502
min_iii(int a,int b,int c)503 MINLINE int min_iii(int a, int b, int c)
504 {
505 return min_ii(min_ii(a, b), c);
506 }
max_iii(int a,int b,int c)507 MINLINE int max_iii(int a, int b, int c)
508 {
509 return max_ii(max_ii(a, b), c);
510 }
511
min_ffff(float a,float b,float c,float d)512 MINLINE float min_ffff(float a, float b, float c, float d)
513 {
514 return min_ff(min_fff(a, b, c), d);
515 }
max_ffff(float a,float b,float c,float d)516 MINLINE float max_ffff(float a, float b, float c, float d)
517 {
518 return max_ff(max_fff(a, b, c), d);
519 }
520
min_iiii(int a,int b,int c,int d)521 MINLINE int min_iiii(int a, int b, int c, int d)
522 {
523 return min_ii(min_iii(a, b, c), d);
524 }
max_iiii(int a,int b,int c,int d)525 MINLINE int max_iiii(int a, int b, int c, int d)
526 {
527 return max_ii(max_iii(a, b, c), d);
528 }
529
min_zz(size_t a,size_t b)530 MINLINE size_t min_zz(size_t a, size_t b)
531 {
532 return (a < b) ? a : b;
533 }
max_zz(size_t a,size_t b)534 MINLINE size_t max_zz(size_t a, size_t b)
535 {
536 return (b < a) ? a : b;
537 }
538
min_cc(char a,char b)539 MINLINE char min_cc(char a, char b)
540 {
541 return (a < b) ? a : b;
542 }
max_cc(char a,char b)543 MINLINE char max_cc(char a, char b)
544 {
545 return (b < a) ? a : b;
546 }
547
clamp_i(int value,int min,int max)548 MINLINE int clamp_i(int value, int min, int max)
549 {
550 return min_ii(max_ii(value, min), max);
551 }
552
clamp_f(float value,float min,float max)553 MINLINE float clamp_f(float value, float min, float max)
554 {
555 if (value > max) {
556 return max;
557 }
558 else if (value < min) {
559 return min;
560 }
561 return value;
562 }
563
clamp_z(size_t value,size_t min,size_t max)564 MINLINE size_t clamp_z(size_t value, size_t min, size_t max)
565 {
566 return min_zz(max_zz(value, min), max);
567 }
568
569 /**
570 * Almost-equal for IEEE floats, using absolute difference method.
571 *
572 * \param max_diff: the maximum absolute difference.
573 */
compare_ff(float a,float b,const float max_diff)574 MINLINE int compare_ff(float a, float b, const float max_diff)
575 {
576 return fabsf(a - b) <= max_diff;
577 }
578
579 /**
580 * Almost-equal for IEEE floats, using their integer representation
581 * (mixing ULP and absolute difference methods).
582 *
583 * \param max_diff: is the maximum absolute difference (allows to take care of the near-zero area,
584 * where relative difference methods cannot really work).
585 * \param max_ulps: is the 'maximum number of floats + 1'
586 * allowed between \a a and \a b to consider them equal.
587 *
588 * \see https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
589 */
compare_ff_relative(float a,float b,const float max_diff,const int max_ulps)590 MINLINE int compare_ff_relative(float a, float b, const float max_diff, const int max_ulps)
591 {
592 union {
593 float f;
594 int i;
595 } ua, ub;
596
597 BLI_assert(sizeof(float) == sizeof(int));
598 BLI_assert(max_ulps < (1 << 22));
599
600 if (fabsf(a - b) <= max_diff) {
601 return 1;
602 }
603
604 ua.f = a;
605 ub.f = b;
606
607 /* Important to compare sign from integers, since (-0.0f < 0) is false
608 * (though this shall not be an issue in common cases)... */
609 return ((ua.i < 0) != (ub.i < 0)) ? 0 : (abs(ua.i - ub.i) <= max_ulps) ? 1 : 0;
610 }
611
signf(float f)612 MINLINE float signf(float f)
613 {
614 return (f < 0.f) ? -1.f : 1.f;
615 }
616
compatible_signf(float f)617 MINLINE float compatible_signf(float f)
618 {
619 if (f > 0.0f) {
620 return 1.0f;
621 }
622 if (f < 0.0f) {
623 return -1.0f;
624 }
625 else {
626 return 0.0f;
627 }
628 }
629
signum_i_ex(float a,float eps)630 MINLINE int signum_i_ex(float a, float eps)
631 {
632 if (a > eps) {
633 return 1;
634 }
635 if (a < -eps) {
636 return -1;
637 }
638 else {
639 return 0;
640 }
641 }
642
signum_i(float a)643 MINLINE int signum_i(float a)
644 {
645 if (a > 0.0f) {
646 return 1;
647 }
648 if (a < 0.0f) {
649 return -1;
650 }
651 else {
652 return 0;
653 }
654 }
655
656 /**
657 * Returns number of (base ten) *significant* digits of integer part of given float
658 * (negative in case of decimal-only floats, 0.01 returns -1 e.g.).
659 */
integer_digits_f(const float f)660 MINLINE int integer_digits_f(const float f)
661 {
662 return (f == 0.0f) ? 0 : (int)floor(log10(fabs(f))) + 1;
663 }
664
665 /**
666 * Returns number of (base ten) *significant* digits of integer part of given double
667 * (negative in case of decimal-only floats, 0.01 returns -1 e.g.).
668 */
integer_digits_d(const double d)669 MINLINE int integer_digits_d(const double d)
670 {
671 return (d == 0.0) ? 0 : (int)floor(log10(fabs(d))) + 1;
672 }
673
integer_digits_i(const int i)674 MINLINE int integer_digits_i(const int i)
675 {
676 return (int)log10((double)i) + 1;
677 }
678
679 /* Internal helpers for SSE2 implementation.
680 *
681 * NOTE: Are to be called ONLY from inside `#ifdef __SSE2__` !!!
682 */
683
684 #ifdef __SSE2__
685
686 /* Calculate initial guess for arg^exp based on float representation
687 * This method gives a constant bias, which can be easily compensated by
688 * multiplying with bias_coeff.
689 * Gives better results for exponents near 1 (e. g. 4/5).
690 * exp = exponent, encoded as uint32_t
691 * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as
692 * uint32_t
693 *
694 * We hope that exp and e2coeff gets properly inlined
695 */
_bli_math_fastpow(const int exp,const int e2coeff,const __m128 arg)696 MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp, const int e2coeff, const __m128 arg)
697 {
698 __m128 ret;
699 ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
700 ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
701 ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
702 ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
703 return ret;
704 }
705
706 /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
_bli_math_improve_5throot_solution(const __m128 old_result,const __m128 x)707 MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution(const __m128 old_result, const __m128 x)
708 {
709 __m128 approx2 = _mm_mul_ps(old_result, old_result);
710 __m128 approx4 = _mm_mul_ps(approx2, approx2);
711 __m128 t = _mm_div_ps(x, approx4);
712 __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */
713 return _mm_mul_ps(summ, _mm_set1_ps(1.0f / 5.0f));
714 }
715
716 /* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
_bli_math_fastpow24(const __m128 arg)717 MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
718 {
719 /* max, avg and |avg| errors were calculated in gcc without FMA instructions
720 * The final precision should be better than powf in glibc */
721
722 /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize
723 * avg error.
724 */
725 /* 0x3F4CCCCD = 4/5 */
726 /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
727 /* error max = 0.17, avg = 0.0018, |avg| = 0.05 */
728 __m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg);
729 __m128 arg2 = _mm_mul_ps(arg, arg);
730 __m128 arg4 = _mm_mul_ps(arg2, arg2);
731 /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */
732 x = _bli_math_improve_5throot_solution(x, arg4);
733 /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */
734 x = _bli_math_improve_5throot_solution(x, arg4);
735 /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */
736 x = _bli_math_improve_5throot_solution(x, arg4);
737 return _mm_mul_ps(x, _mm_mul_ps(x, x));
738 }
739
740 /* Calculate powf(x, 1.0f / 2.4) */
_bli_math_fastpow512(const __m128 arg)741 MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
742 {
743 /* 5/12 is too small, so compute the 4th root of 20/12 instead.
744 * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
745 * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
746 */
747 __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
748 __m128 xover = _mm_mul_ps(arg, xf);
749 __m128 xfm1 = _mm_rsqrt_ps(xf);
750 __m128 x2 = _mm_mul_ps(arg, arg);
751 __m128 xunder = _mm_mul_ps(x2, xfm1);
752 /* sqrt2 * over + 2 * sqrt2 * under */
753 __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
754 _mm_add_ps(xover, xunder));
755 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
756 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
757 return xavg;
758 }
759
_bli_math_blend_sse(const __m128 mask,const __m128 a,const __m128 b)760 MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask, const __m128 a, const __m128 b)
761 {
762 return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
763 }
764
765 #endif /* __SSE2__ */
766
767 /* Low level conversion functions */
unit_float_to_uchar_clamp(float val)768 MINLINE unsigned char unit_float_to_uchar_clamp(float val)
769 {
770 return (unsigned char)((
771 (val <= 0.0f) ? 0 : ((val > (1.0f - 0.5f / 255.0f)) ? 255 : ((255.0f * val) + 0.5f))));
772 }
773 #define unit_float_to_uchar_clamp(val) \
774 ((CHECK_TYPE_INLINE(val, float)), unit_float_to_uchar_clamp(val))
775
unit_float_to_ushort_clamp(float val)776 MINLINE unsigned short unit_float_to_ushort_clamp(float val)
777 {
778 return (unsigned short)((val >= 1.0f - 0.5f / 65535) ?
779 65535 :
780 (val <= 0.0f) ? 0 : (val * 65535.0f + 0.5f));
781 }
782 #define unit_float_to_ushort_clamp(val) \
783 ((CHECK_TYPE_INLINE(val, float)), unit_float_to_ushort_clamp(val))
784
unit_ushort_to_uchar(unsigned short val)785 MINLINE unsigned char unit_ushort_to_uchar(unsigned short val)
786 {
787 return (unsigned char)(((val) >= 65535 - 128) ? 255 : ((val) + 128) >> 8);
788 }
789 #define unit_ushort_to_uchar(val) \
790 ((CHECK_TYPE_INLINE(val, unsigned short)), unit_ushort_to_uchar(val))
791
792 #define unit_float_to_uchar_clamp_v3(v1, v2) \
793 { \
794 (v1)[0] = unit_float_to_uchar_clamp((v2[0])); \
795 (v1)[1] = unit_float_to_uchar_clamp((v2[1])); \
796 (v1)[2] = unit_float_to_uchar_clamp((v2[2])); \
797 } \
798 ((void)0)
799 #define unit_float_to_uchar_clamp_v4(v1, v2) \
800 { \
801 (v1)[0] = unit_float_to_uchar_clamp((v2[0])); \
802 (v1)[1] = unit_float_to_uchar_clamp((v2[1])); \
803 (v1)[2] = unit_float_to_uchar_clamp((v2[2])); \
804 (v1)[3] = unit_float_to_uchar_clamp((v2[3])); \
805 } \
806 ((void)0)
807
808 #ifdef __cplusplus
809 }
810 #endif
811
812 #endif /* __MATH_BASE_INLINE_C__ */
813