1 /*
2  * This program is free software; you can redistribute it and/or
3  * modify it under the terms of the GNU General Public License
4  * as published by the Free Software Foundation; either version 2
5  * of the License, or (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software Foundation,
14  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
15  *
16  * The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
17  * All rights reserved.
18  *
19  * The Original Code is: some of this file.
20  *
21  * */
22 
23 /** \file
24  * \ingroup bli
25  */
26 
27 #ifndef __MATH_BASE_INLINE_C__
28 #define __MATH_BASE_INLINE_C__
29 
30 #include <float.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 
35 #ifdef __SSE2__
36 #  include <emmintrin.h>
37 #endif
38 
39 #include "BLI_math_base.h"
40 
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44 
45 /* copied from BLI_utildefines.h */
46 #ifdef __GNUC__
47 #  define UNLIKELY(x) __builtin_expect(!!(x), 0)
48 #else
49 #  define UNLIKELY(x) (x)
50 #endif
51 
52 /* powf is really slow for raising to integer powers. */
pow2f(float x)53 MINLINE float pow2f(float x)
54 {
55   return x * x;
56 }
pow3f(float x)57 MINLINE float pow3f(float x)
58 {
59   return pow2f(x) * x;
60 }
pow4f(float x)61 MINLINE float pow4f(float x)
62 {
63   return pow2f(pow2f(x));
64 }
pow5f(float x)65 MINLINE float pow5f(float x)
66 {
67   return pow4f(x) * x;
68 }
pow7f(float x)69 MINLINE float pow7f(float x)
70 {
71   return pow2f(pow3f(x)) * x;
72 }
73 
sqrt3f(float f)74 MINLINE float sqrt3f(float f)
75 {
76   if (UNLIKELY(f == 0.0f)) {
77     return 0.0f;
78   }
79   else if (UNLIKELY(f < 0.0f)) {
80     return -(float)(exp(log(-f) / 3.0));
81   }
82   else {
83     return (float)(exp(log(f) / 3.0));
84   }
85 }
86 
sqrt3d(double d)87 MINLINE double sqrt3d(double d)
88 {
89   if (UNLIKELY(d == 0.0)) {
90     return 0.0;
91   }
92   else if (UNLIKELY(d < 0.0)) {
93     return -exp(log(-d) / 3.0);
94   }
95   else {
96     return exp(log(d) / 3.0);
97   }
98 }
99 
sqrtf_signed(float f)100 MINLINE float sqrtf_signed(float f)
101 {
102   return (f >= 0.0f) ? sqrtf(f) : -sqrtf(-f);
103 }
104 
saacos(float fac)105 MINLINE float saacos(float fac)
106 {
107   if (UNLIKELY(fac <= -1.0f)) {
108     return (float)M_PI;
109   }
110   else if (UNLIKELY(fac >= 1.0f)) {
111     return 0.0f;
112   }
113   else {
114     return acosf(fac);
115   }
116 }
117 
saasin(float fac)118 MINLINE float saasin(float fac)
119 {
120   if (UNLIKELY(fac <= -1.0f)) {
121     return (float)-M_PI / 2.0f;
122   }
123   else if (UNLIKELY(fac >= 1.0f)) {
124     return (float)M_PI / 2.0f;
125   }
126   else {
127     return asinf(fac);
128   }
129 }
130 
sasqrt(float fac)131 MINLINE float sasqrt(float fac)
132 {
133   if (UNLIKELY(fac <= 0.0f)) {
134     return 0.0f;
135   }
136   else {
137     return sqrtf(fac);
138   }
139 }
140 
saacosf(float fac)141 MINLINE float saacosf(float fac)
142 {
143   if (UNLIKELY(fac <= -1.0f)) {
144     return (float)M_PI;
145   }
146   else if (UNLIKELY(fac >= 1.0f)) {
147     return 0.0f;
148   }
149   else {
150     return acosf(fac);
151   }
152 }
153 
saasinf(float fac)154 MINLINE float saasinf(float fac)
155 {
156   if (UNLIKELY(fac <= -1.0f)) {
157     return (float)-M_PI / 2.0f;
158   }
159   else if (UNLIKELY(fac >= 1.0f)) {
160     return (float)M_PI / 2.0f;
161   }
162   else {
163     return asinf(fac);
164   }
165 }
166 
sasqrtf(float fac)167 MINLINE float sasqrtf(float fac)
168 {
169   if (UNLIKELY(fac <= 0.0f)) {
170     return 0.0f;
171   }
172   else {
173     return sqrtf(fac);
174   }
175 }
176 
interpf(float target,float origin,float fac)177 MINLINE float interpf(float target, float origin, float fac)
178 {
179   return (fac * target) + (1.0f - fac) * origin;
180 }
181 
interpd(double target,double origin,double fac)182 MINLINE double interpd(double target, double origin, double fac)
183 {
184   return (fac * target) + (1.0f - fac) * origin;
185 }
186 
187 /* used for zoom values*/
power_of_2(float val)188 MINLINE float power_of_2(float val)
189 {
190   return (float)pow(2.0, ceil(log((double)val) / M_LN2));
191 }
192 
is_power_of_2_i(int n)193 MINLINE int is_power_of_2_i(int n)
194 {
195   return (n & (n - 1)) == 0;
196 }
197 
power_of_2_max_i(int n)198 MINLINE int power_of_2_max_i(int n)
199 {
200   if (is_power_of_2_i(n)) {
201     return n;
202   }
203 
204   do {
205     n = n & (n - 1);
206   } while (!is_power_of_2_i(n));
207 
208   return n * 2;
209 }
210 
power_of_2_min_i(int n)211 MINLINE int power_of_2_min_i(int n)
212 {
213   while (!is_power_of_2_i(n)) {
214     n = n & (n - 1);
215   }
216 
217   return n;
218 }
219 
power_of_2_max_u(unsigned int x)220 MINLINE unsigned int power_of_2_max_u(unsigned int x)
221 {
222   x -= 1;
223   x |= (x >> 1);
224   x |= (x >> 2);
225   x |= (x >> 4);
226   x |= (x >> 8);
227   x |= (x >> 16);
228   return x + 1;
229 }
230 
power_of_2_min_u(unsigned int x)231 MINLINE unsigned int power_of_2_min_u(unsigned int x)
232 {
233   x |= (x >> 1);
234   x |= (x >> 2);
235   x |= (x >> 4);
236   x |= (x >> 8);
237   x |= (x >> 16);
238   return x - (x >> 1);
239 }
240 
log2_floor_u(unsigned int x)241 MINLINE unsigned int log2_floor_u(unsigned int x)
242 {
243   return x <= 1 ? 0 : 1 + log2_floor_u(x >> 1);
244 }
245 
log2_ceil_u(unsigned int x)246 MINLINE unsigned int log2_ceil_u(unsigned int x)
247 {
248   if (is_power_of_2_i((int)x)) {
249     return log2_floor_u(x);
250   }
251   else {
252     return log2_floor_u(x) + 1;
253   }
254 }
255 
256 /* rounding and clamping */
257 
258 #define _round_clamp_fl_impl(arg, ty, min, max) \
259   { \
260     float r = floorf(arg + 0.5f); \
261     if (UNLIKELY(r <= (float)min)) { \
262       return (ty)min; \
263     } \
264     else if (UNLIKELY(r >= (float)max)) { \
265       return (ty)max; \
266     } \
267     else { \
268       return (ty)r; \
269     } \
270   }
271 
272 #define _round_clamp_db_impl(arg, ty, min, max) \
273   { \
274     double r = floor(arg + 0.5); \
275     if (UNLIKELY(r <= (double)min)) { \
276       return (ty)min; \
277     } \
278     else if (UNLIKELY(r >= (double)max)) { \
279       return (ty)max; \
280     } \
281     else { \
282       return (ty)r; \
283     } \
284   }
285 
286 #define _round_fl_impl(arg, ty) \
287   { \
288     return (ty)floorf(arg + 0.5f); \
289   }
290 #define _round_db_impl(arg, ty) \
291   { \
292     return (ty)floor(arg + 0.5); \
293   }
294 
round_fl_to_char(float a)295 MINLINE signed char round_fl_to_char(float a){_round_fl_impl(a, signed char)} MINLINE
round_fl_to_uchar(float a)296     unsigned char round_fl_to_uchar(float a){_round_fl_impl(a, unsigned char)} MINLINE
round_fl_to_short(float a)297     short round_fl_to_short(float a){_round_fl_impl(a, short)} MINLINE
round_fl_to_ushort(float a)298     unsigned short round_fl_to_ushort(float a){_round_fl_impl(a, unsigned short)} MINLINE
round_fl_to_int(float a)299     int round_fl_to_int(float a){_round_fl_impl(a, int)} MINLINE
round_fl_to_uint(float a)300     unsigned int round_fl_to_uint(float a){_round_fl_impl(a, unsigned int)}
301 
round_db_to_char(double a)302 MINLINE signed char round_db_to_char(double a){_round_db_impl(a, signed char)} MINLINE
round_db_to_uchar(double a)303     unsigned char round_db_to_uchar(double a){_round_db_impl(a, unsigned char)} MINLINE
round_db_to_short(double a)304     short round_db_to_short(double a){_round_db_impl(a, short)} MINLINE
round_db_to_ushort(double a)305     unsigned short round_db_to_ushort(double a){_round_db_impl(a, unsigned short)} MINLINE
round_db_to_int(double a)306     int round_db_to_int(double a){_round_db_impl(a, int)} MINLINE
round_db_to_uint(double a)307     unsigned int round_db_to_uint(double a)
308 {
309   _round_db_impl(a, unsigned int)
310 }
311 
312 #undef _round_fl_impl
313 #undef _round_db_impl
314 
round_fl_to_char_clamp(float a)315 MINLINE signed char round_fl_to_char_clamp(float a){
316     _round_clamp_fl_impl(a, signed char, SCHAR_MIN, SCHAR_MAX)} MINLINE
round_fl_to_uchar_clamp(float a)317     unsigned char round_fl_to_uchar_clamp(float a){
318         _round_clamp_fl_impl(a, unsigned char, 0, UCHAR_MAX)} MINLINE
round_fl_to_short_clamp(float a)319     short round_fl_to_short_clamp(float a){
320         _round_clamp_fl_impl(a, short, SHRT_MIN, SHRT_MAX)} MINLINE
round_fl_to_ushort_clamp(float a)321     unsigned short round_fl_to_ushort_clamp(float a){
322         _round_clamp_fl_impl(a, unsigned short, 0, USHRT_MAX)} MINLINE
round_fl_to_int_clamp(float a)323     int round_fl_to_int_clamp(float a){_round_clamp_fl_impl(a, int, INT_MIN, INT_MAX)} MINLINE
round_fl_to_uint_clamp(float a)324     unsigned int round_fl_to_uint_clamp(float a){
325         _round_clamp_fl_impl(a, unsigned int, 0, UINT_MAX)}
326 
round_db_to_char_clamp(double a)327 MINLINE signed char round_db_to_char_clamp(double a){
328     _round_clamp_db_impl(a, signed char, SCHAR_MIN, SCHAR_MAX)} MINLINE
round_db_to_uchar_clamp(double a)329     unsigned char round_db_to_uchar_clamp(double a){
330         _round_clamp_db_impl(a, unsigned char, 0, UCHAR_MAX)} MINLINE
round_db_to_short_clamp(double a)331     short round_db_to_short_clamp(double a){
332         _round_clamp_db_impl(a, short, SHRT_MIN, SHRT_MAX)} MINLINE
round_db_to_ushort_clamp(double a)333     unsigned short round_db_to_ushort_clamp(double a){
334         _round_clamp_db_impl(a, unsigned short, 0, USHRT_MAX)} MINLINE
round_db_to_int_clamp(double a)335     int round_db_to_int_clamp(double a){_round_clamp_db_impl(a, int, INT_MIN, INT_MAX)} MINLINE
round_db_to_uint_clamp(double a)336     unsigned int round_db_to_uint_clamp(double a)
337 {
338   _round_clamp_db_impl(a, unsigned int, 0, UINT_MAX)
339 }
340 
341 #undef _round_clamp_fl_impl
342 #undef _round_clamp_db_impl
343 
344 /* integer division that rounds 0.5 up, particularly useful for color blending
345  * with integers, to avoid gradual darkening when rounding down */
divide_round_i(int a,int b)346 MINLINE int divide_round_i(int a, int b)
347 {
348   return (2 * a + b) / (2 * b);
349 }
350 
351 /**
352  * Integer division that floors negative result.
353  * \note This works like Python's int division.
354  */
divide_floor_i(int a,int b)355 MINLINE int divide_floor_i(int a, int b)
356 {
357   int d = a / b;
358   int r = a % b; /* Optimizes into a single division. */
359   return r ? d - ((a < 0) ^ (b < 0)) : d;
360 }
361 
362 /**
363  * Integer division that returns the ceiling, instead of flooring like normal C division.
364  */
divide_ceil_u(uint a,uint b)365 MINLINE uint divide_ceil_u(uint a, uint b)
366 {
367   return (a + b - 1) / b;
368 }
369 
370 /**
371  * modulo that handles negative numbers, works the same as Python's.
372  */
mod_i(int i,int n)373 MINLINE int mod_i(int i, int n)
374 {
375   return (i % n + n) % n;
376 }
377 
fractf(float a)378 MINLINE float fractf(float a)
379 {
380   return a - floorf(a);
381 }
382 
383 /* Adapted from godotengine math_funcs.h. */
wrapf(float value,float max,float min)384 MINLINE float wrapf(float value, float max, float min)
385 {
386   float range = max - min;
387   return (range != 0.0f) ? value - (range * floorf((value - min) / range)) : min;
388 }
389 
390 // Square.
391 
square_s(short a)392 MINLINE int square_s(short a)
393 {
394   return a * a;
395 }
396 
square_i(int a)397 MINLINE int square_i(int a)
398 {
399   return a * a;
400 }
401 
square_uint(unsigned int a)402 MINLINE unsigned int square_uint(unsigned int a)
403 {
404   return a * a;
405 }
406 
square_uchar(unsigned char a)407 MINLINE int square_uchar(unsigned char a)
408 {
409   return a * a;
410 }
411 
square_f(float a)412 MINLINE float square_f(float a)
413 {
414   return a * a;
415 }
416 
square_d(double a)417 MINLINE double square_d(double a)
418 {
419   return a * a;
420 }
421 
422 // Cube.
423 
cube_s(short a)424 MINLINE int cube_s(short a)
425 {
426   return a * a * a;
427 }
428 
cube_i(int a)429 MINLINE int cube_i(int a)
430 {
431   return a * a * a;
432 }
433 
cube_uint(unsigned int a)434 MINLINE unsigned int cube_uint(unsigned int a)
435 {
436   return a * a * a;
437 }
438 
cube_uchar(unsigned char a)439 MINLINE int cube_uchar(unsigned char a)
440 {
441   return a * a * a;
442 }
443 
cube_f(float a)444 MINLINE float cube_f(float a)
445 {
446   return a * a * a;
447 }
448 
cube_d(double a)449 MINLINE double cube_d(double a)
450 {
451   return a * a * a;
452 }
453 
454 // Min/max
455 
min_ff(float a,float b)456 MINLINE float min_ff(float a, float b)
457 {
458   return (a < b) ? a : b;
459 }
max_ff(float a,float b)460 MINLINE float max_ff(float a, float b)
461 {
462   return (a > b) ? a : b;
463 }
464 /* See: https://www.iquilezles.org/www/articles/smin/smin.htm. */
smoothminf(float a,float b,float c)465 MINLINE float smoothminf(float a, float b, float c)
466 {
467   if (c != 0.0f) {
468     float h = max_ff(c - fabsf(a - b), 0.0f) / c;
469     return min_ff(a, b) - h * h * h * c * (1.0f / 6.0f);
470   }
471   else {
472     return min_ff(a, b);
473   }
474 }
475 
min_dd(double a,double b)476 MINLINE double min_dd(double a, double b)
477 {
478   return (a < b) ? a : b;
479 }
max_dd(double a,double b)480 MINLINE double max_dd(double a, double b)
481 {
482   return (a > b) ? a : b;
483 }
484 
min_ii(int a,int b)485 MINLINE int min_ii(int a, int b)
486 {
487   return (a < b) ? a : b;
488 }
max_ii(int a,int b)489 MINLINE int max_ii(int a, int b)
490 {
491   return (b < a) ? a : b;
492 }
493 
min_fff(float a,float b,float c)494 MINLINE float min_fff(float a, float b, float c)
495 {
496   return min_ff(min_ff(a, b), c);
497 }
max_fff(float a,float b,float c)498 MINLINE float max_fff(float a, float b, float c)
499 {
500   return max_ff(max_ff(a, b), c);
501 }
502 
min_iii(int a,int b,int c)503 MINLINE int min_iii(int a, int b, int c)
504 {
505   return min_ii(min_ii(a, b), c);
506 }
max_iii(int a,int b,int c)507 MINLINE int max_iii(int a, int b, int c)
508 {
509   return max_ii(max_ii(a, b), c);
510 }
511 
min_ffff(float a,float b,float c,float d)512 MINLINE float min_ffff(float a, float b, float c, float d)
513 {
514   return min_ff(min_fff(a, b, c), d);
515 }
max_ffff(float a,float b,float c,float d)516 MINLINE float max_ffff(float a, float b, float c, float d)
517 {
518   return max_ff(max_fff(a, b, c), d);
519 }
520 
min_iiii(int a,int b,int c,int d)521 MINLINE int min_iiii(int a, int b, int c, int d)
522 {
523   return min_ii(min_iii(a, b, c), d);
524 }
max_iiii(int a,int b,int c,int d)525 MINLINE int max_iiii(int a, int b, int c, int d)
526 {
527   return max_ii(max_iii(a, b, c), d);
528 }
529 
min_zz(size_t a,size_t b)530 MINLINE size_t min_zz(size_t a, size_t b)
531 {
532   return (a < b) ? a : b;
533 }
max_zz(size_t a,size_t b)534 MINLINE size_t max_zz(size_t a, size_t b)
535 {
536   return (b < a) ? a : b;
537 }
538 
min_cc(char a,char b)539 MINLINE char min_cc(char a, char b)
540 {
541   return (a < b) ? a : b;
542 }
max_cc(char a,char b)543 MINLINE char max_cc(char a, char b)
544 {
545   return (b < a) ? a : b;
546 }
547 
clamp_i(int value,int min,int max)548 MINLINE int clamp_i(int value, int min, int max)
549 {
550   return min_ii(max_ii(value, min), max);
551 }
552 
clamp_f(float value,float min,float max)553 MINLINE float clamp_f(float value, float min, float max)
554 {
555   if (value > max) {
556     return max;
557   }
558   else if (value < min) {
559     return min;
560   }
561   return value;
562 }
563 
clamp_z(size_t value,size_t min,size_t max)564 MINLINE size_t clamp_z(size_t value, size_t min, size_t max)
565 {
566   return min_zz(max_zz(value, min), max);
567 }
568 
569 /**
570  * Almost-equal for IEEE floats, using absolute difference method.
571  *
572  * \param max_diff: the maximum absolute difference.
573  */
compare_ff(float a,float b,const float max_diff)574 MINLINE int compare_ff(float a, float b, const float max_diff)
575 {
576   return fabsf(a - b) <= max_diff;
577 }
578 
579 /**
580  * Almost-equal for IEEE floats, using their integer representation
581  * (mixing ULP and absolute difference methods).
582  *
583  * \param max_diff: is the maximum absolute difference (allows to take care of the near-zero area,
584  * where relative difference methods cannot really work).
585  * \param max_ulps: is the 'maximum number of floats + 1'
586  * allowed between \a a and \a b to consider them equal.
587  *
588  * \see https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
589  */
compare_ff_relative(float a,float b,const float max_diff,const int max_ulps)590 MINLINE int compare_ff_relative(float a, float b, const float max_diff, const int max_ulps)
591 {
592   union {
593     float f;
594     int i;
595   } ua, ub;
596 
597   BLI_assert(sizeof(float) == sizeof(int));
598   BLI_assert(max_ulps < (1 << 22));
599 
600   if (fabsf(a - b) <= max_diff) {
601     return 1;
602   }
603 
604   ua.f = a;
605   ub.f = b;
606 
607   /* Important to compare sign from integers, since (-0.0f < 0) is false
608    * (though this shall not be an issue in common cases)... */
609   return ((ua.i < 0) != (ub.i < 0)) ? 0 : (abs(ua.i - ub.i) <= max_ulps) ? 1 : 0;
610 }
611 
signf(float f)612 MINLINE float signf(float f)
613 {
614   return (f < 0.f) ? -1.f : 1.f;
615 }
616 
compatible_signf(float f)617 MINLINE float compatible_signf(float f)
618 {
619   if (f > 0.0f) {
620     return 1.0f;
621   }
622   if (f < 0.0f) {
623     return -1.0f;
624   }
625   else {
626     return 0.0f;
627   }
628 }
629 
signum_i_ex(float a,float eps)630 MINLINE int signum_i_ex(float a, float eps)
631 {
632   if (a > eps) {
633     return 1;
634   }
635   if (a < -eps) {
636     return -1;
637   }
638   else {
639     return 0;
640   }
641 }
642 
signum_i(float a)643 MINLINE int signum_i(float a)
644 {
645   if (a > 0.0f) {
646     return 1;
647   }
648   if (a < 0.0f) {
649     return -1;
650   }
651   else {
652     return 0;
653   }
654 }
655 
656 /**
657  * Returns number of (base ten) *significant* digits of integer part of given float
658  * (negative in case of decimal-only floats, 0.01 returns -1 e.g.).
659  */
integer_digits_f(const float f)660 MINLINE int integer_digits_f(const float f)
661 {
662   return (f == 0.0f) ? 0 : (int)floor(log10(fabs(f))) + 1;
663 }
664 
665 /**
666  * Returns number of (base ten) *significant* digits of integer part of given double
667  * (negative in case of decimal-only floats, 0.01 returns -1 e.g.).
668  */
integer_digits_d(const double d)669 MINLINE int integer_digits_d(const double d)
670 {
671   return (d == 0.0) ? 0 : (int)floor(log10(fabs(d))) + 1;
672 }
673 
integer_digits_i(const int i)674 MINLINE int integer_digits_i(const int i)
675 {
676   return (int)log10((double)i) + 1;
677 }
678 
679 /* Internal helpers for SSE2 implementation.
680  *
681  * NOTE: Are to be called ONLY from inside `#ifdef __SSE2__` !!!
682  */
683 
684 #ifdef __SSE2__
685 
686 /* Calculate initial guess for arg^exp based on float representation
687  * This method gives a constant bias, which can be easily compensated by
688  * multiplying with bias_coeff.
689  * Gives better results for exponents near 1 (e. g. 4/5).
690  * exp = exponent, encoded as uint32_t
691  * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as
692  * uint32_t
693  *
694  * We hope that exp and e2coeff gets properly inlined
695  */
_bli_math_fastpow(const int exp,const int e2coeff,const __m128 arg)696 MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp, const int e2coeff, const __m128 arg)
697 {
698   __m128 ret;
699   ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
700   ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
701   ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
702   ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
703   return ret;
704 }
705 
706 /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
_bli_math_improve_5throot_solution(const __m128 old_result,const __m128 x)707 MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution(const __m128 old_result, const __m128 x)
708 {
709   __m128 approx2 = _mm_mul_ps(old_result, old_result);
710   __m128 approx4 = _mm_mul_ps(approx2, approx2);
711   __m128 t = _mm_div_ps(x, approx4);
712   __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */
713   return _mm_mul_ps(summ, _mm_set1_ps(1.0f / 5.0f));
714 }
715 
716 /* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
_bli_math_fastpow24(const __m128 arg)717 MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
718 {
719   /* max, avg and |avg| errors were calculated in gcc without FMA instructions
720    * The final precision should be better than powf in glibc */
721 
722   /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize
723    * avg error.
724    */
725   /* 0x3F4CCCCD = 4/5 */
726   /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
727   /* error max = 0.17, avg = 0.0018, |avg| = 0.05 */
728   __m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg);
729   __m128 arg2 = _mm_mul_ps(arg, arg);
730   __m128 arg4 = _mm_mul_ps(arg2, arg2);
731   /* error max = 0.018        avg = 0.0031    |avg| = 0.0031  */
732   x = _bli_math_improve_5throot_solution(x, arg4);
733   /* error max = 0.00021    avg = 1.6e-05    |avg| = 1.6e-05 */
734   x = _bli_math_improve_5throot_solution(x, arg4);
735   /* error max = 6.1e-07    avg = 5.2e-08    |avg| = 1.1e-07 */
736   x = _bli_math_improve_5throot_solution(x, arg4);
737   return _mm_mul_ps(x, _mm_mul_ps(x, x));
738 }
739 
740 /* Calculate powf(x, 1.0f / 2.4) */
_bli_math_fastpow512(const __m128 arg)741 MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
742 {
743   /* 5/12 is too small, so compute the 4th root of 20/12 instead.
744    * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
745    * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
746    */
747   __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
748   __m128 xover = _mm_mul_ps(arg, xf);
749   __m128 xfm1 = _mm_rsqrt_ps(xf);
750   __m128 x2 = _mm_mul_ps(arg, arg);
751   __m128 xunder = _mm_mul_ps(x2, xfm1);
752   /* sqrt2 * over + 2 * sqrt2 * under */
753   __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
754                            _mm_add_ps(xover, xunder));
755   xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
756   xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
757   return xavg;
758 }
759 
_bli_math_blend_sse(const __m128 mask,const __m128 a,const __m128 b)760 MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask, const __m128 a, const __m128 b)
761 {
762   return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
763 }
764 
765 #endif /* __SSE2__ */
766 
767 /* Low level conversion functions */
unit_float_to_uchar_clamp(float val)768 MINLINE unsigned char unit_float_to_uchar_clamp(float val)
769 {
770   return (unsigned char)((
771       (val <= 0.0f) ? 0 : ((val > (1.0f - 0.5f / 255.0f)) ? 255 : ((255.0f * val) + 0.5f))));
772 }
773 #define unit_float_to_uchar_clamp(val) \
774   ((CHECK_TYPE_INLINE(val, float)), unit_float_to_uchar_clamp(val))
775 
unit_float_to_ushort_clamp(float val)776 MINLINE unsigned short unit_float_to_ushort_clamp(float val)
777 {
778   return (unsigned short)((val >= 1.0f - 0.5f / 65535) ?
779                               65535 :
780                               (val <= 0.0f) ? 0 : (val * 65535.0f + 0.5f));
781 }
782 #define unit_float_to_ushort_clamp(val) \
783   ((CHECK_TYPE_INLINE(val, float)), unit_float_to_ushort_clamp(val))
784 
unit_ushort_to_uchar(unsigned short val)785 MINLINE unsigned char unit_ushort_to_uchar(unsigned short val)
786 {
787   return (unsigned char)(((val) >= 65535 - 128) ? 255 : ((val) + 128) >> 8);
788 }
789 #define unit_ushort_to_uchar(val) \
790   ((CHECK_TYPE_INLINE(val, unsigned short)), unit_ushort_to_uchar(val))
791 
792 #define unit_float_to_uchar_clamp_v3(v1, v2) \
793   { \
794     (v1)[0] = unit_float_to_uchar_clamp((v2[0])); \
795     (v1)[1] = unit_float_to_uchar_clamp((v2[1])); \
796     (v1)[2] = unit_float_to_uchar_clamp((v2[2])); \
797   } \
798   ((void)0)
799 #define unit_float_to_uchar_clamp_v4(v1, v2) \
800   { \
801     (v1)[0] = unit_float_to_uchar_clamp((v2[0])); \
802     (v1)[1] = unit_float_to_uchar_clamp((v2[1])); \
803     (v1)[2] = unit_float_to_uchar_clamp((v2[2])); \
804     (v1)[3] = unit_float_to_uchar_clamp((v2[3])); \
805   } \
806   ((void)0)
807 
808 #ifdef __cplusplus
809 }
810 #endif
811 
812 #endif /* __MATH_BASE_INLINE_C__ */
813