1 /***************************  vectormath_common.h   ****************************
2 * Author:        Agner Fog
3 * Date created:  2014-04-18
4 * Last modified: 2016-11-25
5 * Version:       1.25
6 * Project:       vector classes
7 * Description:
8 * Header file containing common code for inline version of mathematical functions.
9 *
10 * Theory, methods and inspiration based partially on these sources:
11 * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
12 *   Ellis Horwood, 1989.
13 * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
14 *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
15 * > Cephes math library by Stephen L. Moshier 1992,
16 *   http://www.netlib.org/cephes/
17 *
18 * Calculation methods:
19 * Some functions are using Pad� approximations f(x) = P(x)/Q(x)
20 * Most single precision functions are using Taylor expansions
21 *
22 * For detailed instructions, see VectorClass.pdf
23 *
24 * (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
25 ******************************************************************************/
26 
27 #ifndef VECTORMATH_COMMON_H
28 #define VECTORMATH_COMMON_H  1
29 
30 #ifdef VECTORMATH_LIB_H
31 #error conflicting header files: vectormath_lib.h for external math functions, other vectormath_xxx.h for inline math functions
32 #endif
33 
34 #include <math.h>
35 #include "vectorclass.h"
36 
37 
38 
39 /******************************************************************************
40                define mathematical constants
41 ******************************************************************************/
42 #define VM_PI       3.14159265358979323846           // pi
43 #define VM_PI_2     1.57079632679489661923           // pi / 2
44 #define VM_PI_4     0.785398163397448309616          // pi / 4
45 #define VM_SQRT2    1.41421356237309504880           // sqrt(2)
46 #define VM_LOG2E    1.44269504088896340736           // 1/log(2)
47 #define VM_LOG10E   0.434294481903251827651          // 1/log(10)
48 #define VM_LOG210   3.321928094887362347808          // log2(10)
49 #define VM_LN2      0.693147180559945309417          // log(2)
50 #define VM_LN10     2.30258509299404568402           // log(10)
51 #define VM_SMALLEST_NORMAL  2.2250738585072014E-308  // smallest normal number, double
52 #define VM_SMALLEST_NORMALF 1.17549435E-38f          // smallest normal number, float
53 
54 #ifdef VCL_NAMESPACE
55 namespace VCL_NAMESPACE {
56 #endif
57 
58 /******************************************************************************
59       templates for producing infinite and nan in desired vector type
60 ******************************************************************************/
61 template <class VTYPE>
62 static inline VTYPE infinite_vec();
63 
64 template <>
65 inline Vec2d infinite_vec<Vec2d>() {
66     return infinite2d();
67 }
68 
69 template <>
70 inline Vec4f infinite_vec<Vec4f>() {
71     return infinite4f();
72 }
73 
74 #if MAX_VECTOR_SIZE >= 256
75 
76 template <>
77 inline Vec4d infinite_vec<Vec4d>() {
78     return infinite4d();
79 }
80 
81 template <>
82 inline Vec8f infinite_vec<Vec8f>() {
83     return infinite8f();
84 }
85 
86 #endif // MAX_VECTOR_SIZE >= 256
87 
88 #if MAX_VECTOR_SIZE >= 512
89 
90 template <>
91 inline Vec8d infinite_vec<Vec8d>() {
92     return infinite8d();
93 }
94 
95 template <>
96 inline Vec16f infinite_vec<Vec16f>() {
97     return infinite16f();
98 }
99 
100 #endif // MAX_VECTOR_SIZE >= 512
101 
102 
103 // template for producing quiet NAN
104 template <class VTYPE>
105 static inline VTYPE nan_vec(int n = 0x100);
106 
107 template <>
108 inline Vec2d nan_vec<Vec2d>(int n) {
109     return nan2d(n);
110 }
111 
112 template <>
113 inline Vec4f nan_vec<Vec4f>(int n) {
114     return nan4f(n);
115 }
116 
117 #if MAX_VECTOR_SIZE >= 256
118 
119 template <>
120 inline Vec4d nan_vec<Vec4d>(int n) {
121     return nan4d(n);
122 }
123 
124 template <>
125 inline Vec8f nan_vec<Vec8f>(int n) {
126     return nan8f(n);
127 }
128 
129 #endif // MAX_VECTOR_SIZE >= 256
130 
131 #if MAX_VECTOR_SIZE >= 512
132 
133 template <>
134 inline Vec8d nan_vec<Vec8d>(int n) {
135     return nan8d(n);
136 }
137 
138 template <>
139 inline Vec16f nan_vec<Vec16f>(int n) {
140     return nan16f(n);
141 }
142 
143 #endif // MAX_VECTOR_SIZE >= 512
144 
145 // Define NAN trace values
146 #define NAN_LOG 0x101  // logarithm for x<0
147 #define NAN_POW 0x102  // negative number raised to non-integer power
148 #define NAN_HYP 0x104  // acosh for x<1 and atanh for abs(x)>1
149 
150 
151 /******************************************************************************
152                   templates for polynomials
153 Using Estrin's scheme to make shorter dependency chains and use FMA, starting
154 longest dependency chains first.
155 ******************************************************************************/
156 
157 // template <typedef VECTYPE, typedef CTYPE>
158 template <class VTYPE, class CTYPE>
polynomial_2(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2)159 static inline VTYPE polynomial_2(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2) {
160     // calculates polynomial c2*x^2 + c1*x + c0
161     // VTYPE may be a vector type, CTYPE is a scalar type
162     VTYPE x2 = x * x;
163     //return = x2 * c2 + (x * c1 + c0);
164     return mul_add(x2, c2, mul_add(x, c1, c0));
165 }
166 
167 template<class VTYPE, class CTYPE>
polynomial_3(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3)168 static inline VTYPE polynomial_3(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
169     // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0
170     // VTYPE may be a vector type, CTYPE is a scalar type
171     VTYPE x2 = x * x;
172     //return (c2 + c3*x)*x2 + (c1*x + c0);
173     return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0));
174 }
175 
176 template<class VTYPE, class CTYPE>
polynomial_4(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4)177 static inline VTYPE polynomial_4(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
178     // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
179     // VTYPE may be a vector type, CTYPE is a scalar type
180     VTYPE x2 = x * x;
181     VTYPE x4 = x2 * x2;
182     //return (c2+c3*x)*x2 + ((c0+c1*x) + c4*x4);
183     return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c4*x4);
184 }
185 
186 template<class VTYPE, class CTYPE>
polynomial_4n(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3)187 static inline VTYPE polynomial_4n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
188     // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
189     // VTYPE may be a vector type, CTYPE is a scalar type
190     VTYPE x2 = x * x;
191     VTYPE x4 = x2 * x2;
192     //return (c2+c3*x)*x2 + ((c0+c1*x) + x4);
193     return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + x4);
194 }
195 
196 template<class VTYPE, class CTYPE>
polynomial_5(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5)197 static inline VTYPE polynomial_5(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
198     // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
199     // VTYPE may be a vector type, CTYPE is a scalar type
200     VTYPE x2 = x * x;
201     VTYPE x4 = x2 * x2;
202     //return (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x));
203     return mul_add(mul_add(c3, x, c2), x2, mul_add(mul_add(c5, x, c4), x4, mul_add(c1, x, c0)));
204 }
205 
206 template<class VTYPE, class CTYPE>
polynomial_5n(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4)207 static inline VTYPE polynomial_5n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
208     // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
209     // VTYPE may be a vector type, CTYPE is a scalar type
210     VTYPE x2 = x * x;
211     VTYPE x4 = x2 * x2;
212     //return (c2+c3*x)*x2 + ((c4+x)*x4 + (c0+c1*x));
213     return mul_add(mul_add(c3, x, c2), x2, mul_add(c4 + x, x4, mul_add(c1, x, c0)));
214 }
215 
216 template<class VTYPE, class CTYPE>
polynomial_6(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5,CTYPE c6)217 static inline VTYPE polynomial_6(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) {
218     // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
219     // VTYPE may be a vector type, CTYPE is a scalar type
220     VTYPE x2 = x * x;
221     VTYPE x4 = x2 * x2;
222     //return  (c4+c5*x+c6*x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
223     return mul_add(mul_add(c6, x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
224 }
225 
226 template<class VTYPE, class CTYPE>
polynomial_6n(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5)227 static inline VTYPE polynomial_6n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
228     // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
229     // VTYPE may be a vector type, CTYPE is a scalar type
230     VTYPE x2 = x * x;
231     VTYPE x4 = x2 * x2;
232     //return  (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
233     return mul_add(mul_add(c5, x, c4 + x2), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
234 }
235 
236 template<class VTYPE, class CTYPE>
polynomial_7(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5,CTYPE c6,CTYPE c7)237 static inline VTYPE polynomial_7(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) {
238     // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
239     // VTYPE may be a vector type, CTYPE is a scalar type
240     VTYPE x2 = x * x;
241     VTYPE x4 = x2 * x2;
242     //return  ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
243     return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
244 }
245 
246 template<class VTYPE, class CTYPE>
polynomial_8(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5,CTYPE c6,CTYPE c7,CTYPE c8)247 static inline VTYPE polynomial_8(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) {
248     // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
249     // VTYPE may be a vector type, CTYPE is a scalar type
250     VTYPE x2 = x  * x;
251     VTYPE x4 = x2 * x2;
252     VTYPE x8 = x4 * x4;
253     //return  ((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8*x8 + (c2+c3*x)*x2 + (c0+c1*x));
254     return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
255         mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c8*x8));
256 }
257 
258 template<class VTYPE, class CTYPE>
polynomial_9(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5,CTYPE c6,CTYPE c7,CTYPE c8,CTYPE c9)259 static inline VTYPE polynomial_9(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) {
260     // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
261     // VTYPE may be a vector type, CTYPE is a scalar type
262     VTYPE x2 = x  * x;
263     VTYPE x4 = x2 * x2;
264     VTYPE x8 = x4 * x4;
265     //return  (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x)*x8) + ((c2+c3*x)*x2 + (c0+c1*x));
266     return mul_add(mul_add(c9, x, c8), x8, mul_add(
267         mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
268         mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
269 }
270 
271 template<class VTYPE, class CTYPE>
polynomial_10(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5,CTYPE c6,CTYPE c7,CTYPE c8,CTYPE c9,CTYPE c10)272 static inline VTYPE polynomial_10(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) {
273     // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
274     // VTYPE may be a vector type, CTYPE is a scalar type
275     VTYPE x2 = x  * x;
276     VTYPE x4 = x2 * x2;
277     VTYPE x8 = x4 * x4;
278     //return  (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x+c10*x2)*x8) + ((c2+c3*x)*x2 + (c0+c1*x));
279     return mul_add(mul_add(x2, c10, mul_add(c9, x, c8)), x8,
280         mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
281             mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
282 }
283 
284 template<class VTYPE, class CTYPE>
polynomial_13(VTYPE const & x,CTYPE c0,CTYPE c1,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5,CTYPE c6,CTYPE c7,CTYPE c8,CTYPE c9,CTYPE c10,CTYPE c11,CTYPE c12,CTYPE c13)285 static inline VTYPE polynomial_13(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
286     // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0
287     // VTYPE may be a vector type, CTYPE is a scalar type
288     VTYPE x2 = x  * x;
289     VTYPE x4 = x2 * x2;
290     VTYPE x8 = x4 * x4;
291     return mul_add(
292         mul_add(
293             mul_add(c13, x, c12), x4,
294             mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8,
295         mul_add(
296             mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
297             mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
298 }
299 
300 
301 template<class VTYPE, class CTYPE>
polynomial_13m(VTYPE const & x,CTYPE c2,CTYPE c3,CTYPE c4,CTYPE c5,CTYPE c6,CTYPE c7,CTYPE c8,CTYPE c9,CTYPE c10,CTYPE c11,CTYPE c12,CTYPE c13)302 static inline VTYPE polynomial_13m(VTYPE const & x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
303     // calculates polynomial c13*x^13 + c12*x^12 + ... + x + 0
304     // VTYPE may be a vector type, CTYPE is a scalar type
305     VTYPE x2 = x  * x;
306     VTYPE x4 = x2 * x2;
307     VTYPE x8 = x4 * x4;
308     // return  ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x));
309     return mul_add(
310         mul_add(mul_add(c13, x, c12), x4, mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8,
311         mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, x)));
312 }
313 
314 #ifdef VCL_NAMESPACE
315 }
316 #endif
317 
318 #endif
319