1 /*=====================================================================*
2  *                   Copyright (C) 2012 Paul Mineiro                   *
3  * All rights reserved.                                                *
4  *                                                                     *
5  * Redistribution and use in source and binary forms, with             *
6  * or without modification, are permitted provided that the            *
7  * following conditions are met:                                       *
8  *                                                                     *
9  *     * Redistributions of source code must retain the                *
10  *     above copyright notice, this list of conditions and             *
11  *     the following disclaimer.                                       *
12  *                                                                     *
13  *     * Redistributions in binary form must reproduce the             *
14  *     above copyright notice, this list of conditions and             *
15  *     the following disclaimer in the documentation and/or            *
16  *     other materials provided with the distribution.                 *
17  *                                                                     *
18  *     * Neither the name of Paul Mineiro nor the names                *
19  *     of other contributors may be used to endorse or promote         *
20  *     products derived from this software without specific            *
21  *     prior written permission.                                       *
22  *                                                                     *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
24  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
25  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
27  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
28  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
30  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
31  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
32  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
33  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
34  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
36  * POSSIBILITY OF SUCH DAMAGE.                                         *
37  *                                                                     *
38  * Contact: Paul Mineiro <paul@mineiro.com>                            *
39  *=====================================================================*/
40 
41 #ifndef __CAST_H_
42 
43 #ifdef __cplusplus
44 #define cast_uint32_t static_cast<uint32_t>
45 #else
46 #define cast_uint32_t (uint32_t)
47 #endif
48 
49 #endif // __CAST_H_
50 /*=====================================================================*
51  *                   Copyright (C) 2011 Paul Mineiro                   *
52  * All rights reserved.                                                *
53  *                                                                     *
54  * Redistribution and use in source and binary forms, with             *
55  * or without modification, are permitted provided that the            *
56  * following conditions are met:                                       *
57  *                                                                     *
58  *     * Redistributions of source code must retain the                *
59  *     above copyright notice, this list of conditions and             *
60  *     the following disclaimer.                                       *
61  *                                                                     *
62  *     * Redistributions in binary form must reproduce the             *
63  *     above copyright notice, this list of conditions and             *
64  *     the following disclaimer in the documentation and/or            *
65  *     other materials provided with the distribution.                 *
66  *                                                                     *
67  *     * Neither the name of Paul Mineiro nor the names                *
68  *     of other contributors may be used to endorse or promote         *
69  *     products derived from this software without specific            *
70  *     prior written permission.                                       *
71  *                                                                     *
72  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
73  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
74  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
75  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
76  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
77  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
78  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
79  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
80  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
81  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
82  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
83  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
84  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
85  * POSSIBILITY OF SUCH DAMAGE.                                         *
86  *                                                                     *
87  * Contact: Paul Mineiro <paul@mineiro.com>                            *
88  *=====================================================================*/
89 
90 #ifndef __SSE_H_
91 #define __SSE_H_
92 
93 #ifdef __SSE2__
94 
95 #include <emmintrin.h>
96 
97 #ifdef __cplusplus
98 namespace {
99 #endif // __cplusplus
100 
101 typedef __m128 v4sf;
102 typedef __m128i v4si;
103 
104 #define v4si_to_v4sf _mm_cvtepi32_ps
105 #define v4sf_to_v4si _mm_cvttps_epi32
106 
107 #if _MSC_VER && !__INTEL_COMPILER
108   template <class T>
GetChar(T value,size_t index)109   __forceinline char GetChar(T value, size_t index) { return ((char*)&value)[index]; }
110 
111   #define AS_4CHARS(a) \
112       GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \
113       GetChar(int32_t(a), 2), GetChar(int32_t(a), 3)
114 
115   #define _MM_SETR_EPI32(a0, a1, a2, a3) \
116       { AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) }
117 
118   #define v4sfl(x) (const v4sf { (x), (x), (x), (x) })
119   #define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x))
120 
121   __forceinline const v4sf operator+(const v4sf& a, const v4sf& b) { return _mm_add_ps(a,b); }
122   __forceinline const v4sf operator-(const v4sf& a, const v4sf& b) { return _mm_sub_ps(a,b); }
123   __forceinline const v4sf operator/(const v4sf& a, const v4sf& b) { return _mm_div_ps(a,b); }
124   __forceinline const v4sf operator*(const v4sf& a, const v4sf& b) { return _mm_mul_ps(a,b); }
125 
126   __forceinline const v4sf operator+(const v4sf& a) { return a; }
127   __forceinline const v4sf operator-(const v4sf& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
128 
129   __forceinline const v4sf operator&(const v4sf& a, const v4sf& b) { return _mm_and_ps(a,b); }
130   __forceinline const v4sf operator|(const v4sf& a, const v4sf& b) { return _mm_or_ps(a,b); }
131   __forceinline const v4sf operator^(const v4sf& a, const v4sf& b) { return _mm_xor_ps(a,b); }
132 
133   __forceinline const v4si operator&(const v4si& a, const v4si& b) { return _mm_and_si128(a,b); }
134   __forceinline const v4si operator|(const v4si& a, const v4si& b) { return _mm_or_si128(a,b); }
135   __forceinline const v4si operator^(const v4si& a, const v4si& b) { return _mm_xor_si128(a,b); }
136 
137   __forceinline const v4sf operator+=(v4sf& a, const v4sf& b) { return a = a + b; }
138   __forceinline const v4sf operator-=(v4sf& a, const v4sf& b) { return a = a - b; }
139   __forceinline const v4sf operator*=(v4sf& a, const v4sf& b) { return a = a * b; }
140   __forceinline const v4sf operator/=(v4sf& a, const v4sf& b) { return a = a / b; }
141 
142   __forceinline const v4si operator|=(v4si& a, const v4si& b) { return a = a | b; }
143   __forceinline const v4si operator&=(v4si& a, const v4si& b) { return a = a & b; }
144   __forceinline const v4si operator^=(v4si& a, const v4si& b) { return a = a ^ b; }
145 #else
146   #define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
147   #define v2dil(x) ((const v4si) { (x), (x) })
148   #define v4sil(x) v2dil((((long long) (x)) << 32) | (long long) (x))
149 #endif
150 
151 typedef union { v4sf f; float array[4]; } v4sfindexer;
152 #define v4sf_index(_findx, _findi)      \
153   ({                                    \
154      v4sfindexer _findvx = { _findx } ; \
155      _findvx.array[_findi];             \
156   })
157 typedef union { v4si i; int array[4]; } v4siindexer;
158 #define v4si_index(_iindx, _iindi)      \
159   ({                                    \
160      v4siindexer _iindvx = { _iindx } ; \
161      _iindvx.array[_iindi];             \
162   })
163 
164 typedef union { v4sf f; v4si i; } v4sfv4sipun;
165 #if _MSC_VER && !__INTEL_COMPILER
166   #define v4sf_fabs(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
167 #else
168   #define v4sf_fabs(x)                  \
169   ({                                    \
170      v4sfv4sipun vx;                    \
171      vx.f = x;                          \
172      vx.i &= v4sil (0x7FFFFFFF);        \
173      vx.f;                              \
174   })
175 #endif
176 
177 #ifdef __cplusplus
178 } // end namespace
179 #endif // __cplusplus
180 
181 #endif // __SSE2__
182 
183 #endif // __SSE_H_
184 /*=====================================================================*
185  *                   Copyright (C) 2011 Paul Mineiro                   *
186  * All rights reserved.                                                *
187  *                                                                     *
188  * Redistribution and use in source and binary forms, with             *
189  * or without modification, are permitted provided that the            *
190  * following conditions are met:                                       *
191  *                                                                     *
192  *     * Redistributions of source code must retain the                *
193  *     above copyright notice, this list of conditions and             *
194  *     the following disclaimer.                                       *
195  *                                                                     *
196  *     * Redistributions in binary form must reproduce the             *
197  *     above copyright notice, this list of conditions and             *
198  *     the following disclaimer in the documentation and/or            *
199  *     other materials provided with the distribution.                 *
200  *                                                                     *
201  *     * Neither the name of Paul Mineiro nor the names                *
202  *     of other contributors may be used to endorse or promote         *
203  *     products derived from this software without specific            *
204  *     prior written permission.                                       *
205  *                                                                     *
206  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
207  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
208  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
209  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
210  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
211  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
212  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
213  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
214  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
215  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
216  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
217  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
218  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
219  * POSSIBILITY OF SUCH DAMAGE.                                         *
220  *                                                                     *
221  * Contact: Paul Mineiro <paul@mineiro.com>                            *
222  *=====================================================================*/
223 
224 #ifndef __FAST_EXP_H_
225 #define __FAST_EXP_H_
226 
227 #include <stdint.h>
228 
229 // Underflow of exponential is common practice in numerical routines,
230 // so handle it here.
231 
232 static inline float
fastpow2(float p)233 fastpow2 (float p)
234 {
235   float offset = (p < 0) ? 1.0f : 0.0f;
236   float clipp = (p < -126) ? -126.0f : p;
237   int w = clipp;
238   float z = clipp - w + offset;
239   union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) };
240 
241   return v.f;
242 }
243 
244 static inline float
fastexp(float p)245 fastexp (float p)
246 {
247   return fastpow2 (1.442695040f * p);
248 }
249 
250 static inline float
fasterpow2(float p)251 fasterpow2 (float p)
252 {
253   float clipp = (p < -126) ? -126.0f : p;
254   union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) };
255   return v.f;
256 }
257 
258 static inline float
fasterexp(float p)259 fasterexp (float p)
260 {
261   return fasterpow2 (1.442695040f * p);
262 }
263 
264 #ifdef __SSE2__
265 
266 static inline v4sf
vfastpow2(const v4sf p)267 vfastpow2 (const v4sf p)
268 {
269   v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f));
270   v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f));
271   v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
272   v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
273   v4si w = v4sf_to_v4si (clipp);
274   v4sf z = clipp - v4si_to_v4sf (w) + offset;
275 
276   const v4sf c_121_2740838 = v4sfl (121.2740575f);
277   const v4sf c_27_7280233 = v4sfl (27.7280233f);
278   const v4sf c_4_84252568 = v4sfl (4.84252568f);
279   const v4sf c_1_49012907 = v4sfl (1.49012907f);
280   union { v4si i; v4sf f; } v = {
281     v4sf_to_v4si (
282       v4sfl (1 << 23) *
283       (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z)
284     )
285   };
286 
287   return v.f;
288 }
289 
290 static inline v4sf
vfastexp(const v4sf p)291 vfastexp (const v4sf p)
292 {
293   const v4sf c_invlog_2 = v4sfl (1.442695040f);
294 
295   return vfastpow2 (c_invlog_2 * p);
296 }
297 
298 static inline v4sf
vfasterpow2(const v4sf p)299 vfasterpow2 (const v4sf p)
300 {
301   const v4sf c_126_94269504 = v4sfl (126.94269504f);
302   v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
303   v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
304   union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) };
305   return v.f;
306 }
307 
308 static inline v4sf
vfasterexp(const v4sf p)309 vfasterexp (const v4sf p)
310 {
311   const v4sf c_invlog_2 = v4sfl (1.442695040f);
312 
313   return vfasterpow2 (c_invlog_2 * p);
314 }
315 
316 #endif //__SSE2__
317 
318 #endif // __FAST_EXP_H_
319 /*=====================================================================*
320  *                   Copyright (C) 2011 Paul Mineiro                   *
321  * All rights reserved.                                                *
322  *                                                                     *
323  * Redistribution and use in source and binary forms, with             *
324  * or without modification, are permitted provided that the            *
325  * following conditions are met:                                       *
326  *                                                                     *
327  *     * Redistributions of source code must retain the                *
328  *     above copyright notice, this list of conditions and             *
329  *     the following disclaimer.                                       *
330  *                                                                     *
331  *     * Redistributions in binary form must reproduce the             *
332  *     above copyright notice, this list of conditions and             *
333  *     the following disclaimer in the documentation and/or            *
334  *     other materials provided with the distribution.                 *
335  *                                                                     *
336  *     * Neither the name of Paul Mineiro nor the names                *
337  *     of other contributors may be used to endorse or promote         *
338  *     products derived from this software without specific            *
339  *     prior written permission.                                       *
340  *                                                                     *
341  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
342  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
343  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
344  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
345  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
346  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
347  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
348  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
349  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
350  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
351  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
352  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
353  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
354  * POSSIBILITY OF SUCH DAMAGE.                                         *
355  *                                                                     *
356  * Contact: Paul Mineiro <paul@mineiro.com>                            *
357  *=====================================================================*/
358 
359 #ifndef __FAST_LOG_H_
360 #define __FAST_LOG_H_
361 
362 #include <stdint.h>
363 
364 static inline float
fastlog2(float x)365 fastlog2 (float x)
366 {
367   union { float f; uint32_t i; } vx = { x };
368   union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
369   float y = vx.i;
370   y *= 1.1920928955078125e-7f;
371 
372   return y - 124.22551499f
373            - 1.498030302f * mx.f
374            - 1.72587999f / (0.3520887068f + mx.f);
375 }
376 
377 static inline float
fastlog(float x)378 fastlog (float x)
379 {
380   return 0.69314718f * fastlog2 (x);
381 }
382 
383 static inline float
fasterlog2(float x)384 fasterlog2 (float x)
385 {
386   union { float f; uint32_t i; } vx = { x };
387   float y = vx.i;
388   y *= 1.1920928955078125e-7f;
389   return y - 126.94269504f;
390 }
391 
392 static inline float
fasterlog(float x)393 fasterlog (float x)
394 {
395 //  return 0.69314718f * fasterlog2 (x);
396 
397   union { float f; uint32_t i; } vx = { x };
398   float y = vx.i;
399   y *= 8.2629582881927490e-8f;
400   return y - 87.989971088f;
401 }
402 
403 #ifdef __SSE2__
404 
405 static inline v4sf
vfastlog2(v4sf x)406 vfastlog2 (v4sf x)
407 {
408   union { v4sf f; v4si i; } vx = { x };
409   union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000);
410   v4sf y = v4si_to_v4sf (vx.i);
411   y *= v4sfl (1.1920928955078125e-7f);
412 
413   const v4sf c_124_22551499 = v4sfl (124.22551499f);
414   const v4sf c_1_498030302 = v4sfl (1.498030302f);
415   const v4sf c_1_725877999 = v4sfl (1.72587999f);
416   const v4sf c_0_3520087068 = v4sfl (0.3520887068f);
417 
418   return y - c_124_22551499
419            - c_1_498030302 * mx.f
420            - c_1_725877999 / (c_0_3520087068 + mx.f);
421 }
422 
423 static inline v4sf
vfastlog(v4sf x)424 vfastlog (v4sf x)
425 {
426   const v4sf c_0_69314718 = v4sfl (0.69314718f);
427 
428   return c_0_69314718 * vfastlog2 (x);
429 }
430 
431 static inline v4sf
vfasterlog2(v4sf x)432 vfasterlog2 (v4sf x)
433 {
434   union { v4sf f; v4si i; } vx = { x };
435   v4sf y = v4si_to_v4sf (vx.i);
436   y *= v4sfl (1.1920928955078125e-7f);
437 
438   const v4sf c_126_94269504 = v4sfl (126.94269504f);
439 
440   return y - c_126_94269504;
441 }
442 
443 static inline v4sf
vfasterlog(v4sf x)444 vfasterlog (v4sf x)
445 {
446 //  const v4sf c_0_69314718 = v4sfl (0.69314718f);
447 //
448 //  return c_0_69314718 * vfasterlog2 (x);
449 
450   union { v4sf f; v4si i; } vx = { x };
451   v4sf y = v4si_to_v4sf (vx.i);
452   y *= v4sfl (8.2629582881927490e-8f);
453 
454   const v4sf c_87_989971088 = v4sfl (87.989971088f);
455 
456   return y - c_87_989971088;
457 }
458 
459 #endif // __SSE2__
460 
461 #endif // __FAST_LOG_H_
462 /*=====================================================================*
463  *                   Copyright (C) 2011 Paul Mineiro                   *
464  * All rights reserved.                                                *
465  *                                                                     *
466  * Redistribution and use in source and binary forms, with             *
467  * or without modification, are permitted provided that the            *
468  * following conditions are met:                                       *
469  *                                                                     *
470  *     * Redistributions of source code must retain the                *
471  *     above copyright notice, this list of conditions and             *
472  *     the following disclaimer.                                       *
473  *                                                                     *
474  *     * Redistributions in binary form must reproduce the             *
475  *     above copyright notice, this list of conditions and             *
476  *     the following disclaimer in the documentation and/or            *
477  *     other materials provided with the distribution.                 *
478  *                                                                     *
479  *     * Neither the name of Paul Mineiro nor the names                *
480  *     of other contributors may be used to endorse or promote         *
481  *     products derived from this software without specific            *
482  *     prior written permission.                                       *
483  *                                                                     *
484  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
485  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
486  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
487  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
488  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
489  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
490  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
491  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
492  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
493  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
494  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
495  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
496  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
497  * POSSIBILITY OF SUCH DAMAGE.                                         *
498  *                                                                     *
499  * Contact: Paul Mineiro <paul@mineiro.com>                            *
500  *=====================================================================*/
501 
502 #ifndef __FAST_ERF_H_
503 #define __FAST_ERF_H_
504 
505 #include <math.h>
506 #include <stdint.h>
507 
508 // fasterfc: not actually faster than erfcf(3) on newer machines!
509 // ... although vectorized version is interesting
510 //     and fastererfc is very fast
511 
512 static inline float
fasterfc(float x)513 fasterfc (float x)
514 {
515   static const float k = 3.3509633149424609f;
516   static const float a = 0.07219054755431126f;
517   static const float b = 15.418191568719577f;
518   static const float c = 5.609846028328545f;
519 
520   union { float f; uint32_t i; } vc = { c * x };
521   float xsq = x * x;
522   float xquad = xsq * xsq;
523 
524   vc.i |= 0x80000000;
525 
526   return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f);
527 }
528 
529 static inline float
fastererfc(float x)530 fastererfc (float x)
531 {
532   static const float k = 3.3509633149424609f;
533 
534   return 2.0f / (1.0f + fasterpow2 (k * x));
535 }
536 
537 // fasterf: not actually faster than erff(3) on newer machines!
538 // ... although vectorized version is interesting
539 //     and fastererf is very fast
540 
541 static inline float
fasterf(float x)542 fasterf (float x)
543 {
544   return 1.0f - fasterfc (x);
545 }
546 
547 static inline float
fastererf(float x)548 fastererf (float x)
549 {
550   return 1.0f - fastererfc (x);
551 }
552 
553 static inline float
fastinverseerf(float x)554 fastinverseerf (float x)
555 {
556   static const float invk = 0.30004578719350504f;
557   static const float a = 0.020287853348211326f;
558   static const float b = 0.07236892874789555f;
559   static const float c = 0.9913030456864257f;
560   static const float d = 0.8059775923760193f;
561 
562   float xsq = x * x;
563 
564   return invk * fastlog2 ((1.0f + x) / (1.0f - x))
565        + x * (a - b * xsq) / (c - d * xsq);
566 }
567 
568 static inline float
fasterinverseerf(float x)569 fasterinverseerf (float x)
570 {
571   static const float invk = 0.30004578719350504f;
572 
573   return invk * fasterlog2 ((1.0f + x) / (1.0f - x));
574 }
575 
576 #ifdef __SSE2__
577 
578 static inline v4sf
vfasterfc(v4sf x)579 vfasterfc (v4sf x)
580 {
581   const v4sf k = v4sfl (3.3509633149424609f);
582   const v4sf a = v4sfl (0.07219054755431126f);
583   const v4sf b = v4sfl (15.418191568719577f);
584   const v4sf c = v4sfl (5.609846028328545f);
585 
586   union { v4sf f; v4si i; } vc; vc.f = c * x;
587   vc.i |= v4sil (0x80000000);
588 
589   v4sf xsq = x * x;
590   v4sf xquad = xsq * xsq;
591 
592   return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f);
593 }
594 
595 static inline v4sf
vfastererfc(const v4sf x)596 vfastererfc (const v4sf x)
597 {
598   const v4sf k = v4sfl (3.3509633149424609f);
599 
600   return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x));
601 }
602 
603 static inline v4sf
vfasterf(v4sf x)604 vfasterf (v4sf x)
605 {
606   return v4sfl (1.0f) - vfasterfc (x);
607 }
608 
609 static inline v4sf
vfastererf(const v4sf x)610 vfastererf (const v4sf x)
611 {
612   return v4sfl (1.0f) - vfastererfc (x);
613 }
614 
615 static inline v4sf
vfastinverseerf(v4sf x)616 vfastinverseerf (v4sf x)
617 {
618   const v4sf invk = v4sfl (0.30004578719350504f);
619   const v4sf a = v4sfl (0.020287853348211326f);
620   const v4sf b = v4sfl (0.07236892874789555f);
621   const v4sf c = v4sfl (0.9913030456864257f);
622   const v4sf d = v4sfl (0.8059775923760193f);
623 
624   v4sf xsq = x * x;
625 
626   return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x))
627        + x * (a - b * xsq) / (c - d * xsq);
628 }
629 
630 static inline v4sf
vfasterinverseerf(v4sf x)631 vfasterinverseerf (v4sf x)
632 {
633   const v4sf invk = v4sfl (0.30004578719350504f);
634 
635   return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x));
636 }
637 
638 #endif //__SSE2__
639 
640 #endif // __FAST_ERF_H_
641 /*=====================================================================*
642  *                   Copyright (C) 2011 Paul Mineiro                   *
643  * All rights reserved.                                                *
644  *                                                                     *
645  * Redistribution and use in source and binary forms, with             *
646  * or without modification, are permitted provided that the            *
647  * following conditions are met:                                       *
648  *                                                                     *
649  *     * Redistributions of source code must retain the                *
650  *     above copyright notice, this list of conditions and             *
651  *     the following disclaimer.                                       *
652  *                                                                     *
653  *     * Redistributions in binary form must reproduce the             *
654  *     above copyright notice, this list of conditions and             *
655  *     the following disclaimer in the documentation and/or            *
656  *     other materials provided with the distribution.                 *
657  *                                                                     *
658  *     * Neither the name of Paul Mineiro nor the names                *
659  *     of other contributors may be used to endorse or promote         *
660  *     products derived from this software without specific            *
661  *     prior written permission.                                       *
662  *                                                                     *
663  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
664  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
665  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
666  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
667  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
668  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
669  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
670  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
671  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
672  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
673  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
674  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
675  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
676  * POSSIBILITY OF SUCH DAMAGE.                                         *
677  *                                                                     *
678  * Contact: Paul Mineiro <paul@mineiro.com>                            *
679  *=====================================================================*/
680 
681 #ifndef __FAST_GAMMA_H_
682 #define __FAST_GAMMA_H_
683 
684 #include <stdint.h>
685 
686 /* gamma/digamma functions only work for positive inputs */
687 
688 static inline float
fastlgamma(float x)689 fastlgamma (float x)
690 {
691   float logterm = fastlog (x * (1.0f + x) * (2.0f + x));
692   float xp3 = 3.0f + x;
693 
694   return - 2.081061466f
695          - x
696          + 0.0833333f / xp3
697          - logterm
698          + (2.5f + x) * fastlog (xp3);
699 }
700 
701 static inline float
fasterlgamma(float x)702 fasterlgamma (float x)
703 {
704   return - 0.0810614667f
705          - x
706          - fasterlog (x)
707          + (0.5f + x) * fasterlog (1.0f + x);
708 }
709 
710 static inline float
fastdigamma(float x)711 fastdigamma (float x)
712 {
713   float twopx = 2.0f + x;
714   float logterm = fastlog (twopx);
715 
716   return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) /
717          (12.0f * x * (1.0f + x) * twopx * twopx)
718          + logterm;
719 }
720 
721 static inline float
fasterdigamma(float x)722 fasterdigamma (float x)
723 {
724   float onepx = 1.0f + x;
725 
726   return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx);
727 }
728 
729 #ifdef __SSE2__
730 
731 static inline v4sf
vfastlgamma(v4sf x)732 vfastlgamma (v4sf x)
733 {
734   const v4sf c_1_0 = v4sfl (1.0f);
735   const v4sf c_2_0 = v4sfl (2.0f);
736   const v4sf c_3_0 = v4sfl (3.0f);
737   const v4sf c_2_081061466 = v4sfl (2.081061466f);
738   const v4sf c_0_0833333 = v4sfl (0.0833333f);
739   const v4sf c_2_5 = v4sfl (2.5f);
740 
741   v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x));
742   v4sf xp3 = c_3_0 + x;
743 
744   return - c_2_081061466
745          - x
746          + c_0_0833333 / xp3
747          - logterm
748          + (c_2_5 + x) * vfastlog (xp3);
749 }
750 
751 static inline v4sf
vfasterlgamma(v4sf x)752 vfasterlgamma (v4sf x)
753 {
754   const v4sf c_0_0810614667 = v4sfl (0.0810614667f);
755   const v4sf c_0_5 = v4sfl (0.5f);
756   const v4sf c_1 = v4sfl (1.0f);
757 
758   return - c_0_0810614667
759          - x
760          - vfasterlog (x)
761          + (c_0_5 + x) * vfasterlog (c_1 + x);
762 }
763 
764 static inline v4sf
vfastdigamma(v4sf x)765 vfastdigamma (v4sf x)
766 {
767   v4sf twopx = v4sfl (2.0f) + x;
768   v4sf logterm = vfastlog (twopx);
769 
770   return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) /
771          (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx)
772          + logterm;
773 }
774 
775 static inline v4sf
vfasterdigamma(v4sf x)776 vfasterdigamma (v4sf x)
777 {
778   const v4sf c_1_0 = v4sfl (1.0f);
779   const v4sf c_2_0 = v4sfl (2.0f);
780   v4sf onepx = c_1_0 + x;
781 
782   return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx);
783 }
784 
785 #endif //__SSE2__
786 
787 #endif // __FAST_GAMMA_H_
788 /*=====================================================================*
789  *                   Copyright (C) 2011 Paul Mineiro                   *
790  * All rights reserved.                                                *
791  *                                                                     *
792  * Redistribution and use in source and binary forms, with             *
793  * or without modification, are permitted provided that the            *
794  * following conditions are met:                                       *
795  *                                                                     *
796  *     * Redistributions of source code must retain the                *
797  *     above copyright notice, this list of conditions and             *
798  *     the following disclaimer.                                       *
799  *                                                                     *
800  *     * Redistributions in binary form must reproduce the             *
801  *     above copyright notice, this list of conditions and             *
802  *     the following disclaimer in the documentation and/or            *
803  *     other materials provided with the distribution.                 *
804  *                                                                     *
805  *     * Neither the name of Paul Mineiro nor the names                *
806  *     of other contributors may be used to endorse or promote         *
807  *     products derived from this software without specific            *
808  *     prior written permission.                                       *
809  *                                                                     *
810  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
811  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
812  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
813  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
814  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
815  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
816  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
817  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
818  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
819  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
820  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
821  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
822  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
823  * POSSIBILITY OF SUCH DAMAGE.                                         *
824  *                                                                     *
825  * Contact: Paul Mineiro <paul@mineiro.com>                            *
826  *=====================================================================*/
827 
828 #ifndef __FAST_HYPERBOLIC_H_
829 #define __FAST_HYPERBOLIC_H_
830 
831 #include <stdint.h>
832 
833 static inline float
fastsinh(float p)834 fastsinh (float p)
835 {
836   return 0.5f * (fastexp (p) - fastexp (-p));
837 }
838 
839 static inline float
fastersinh(float p)840 fastersinh (float p)
841 {
842   return 0.5f * (fasterexp (p) - fasterexp (-p));
843 }
844 
845 static inline float
fastcosh(float p)846 fastcosh (float p)
847 {
848   return 0.5f * (fastexp (p) + fastexp (-p));
849 }
850 
851 static inline float
fastercosh(float p)852 fastercosh (float p)
853 {
854   return 0.5f * (fasterexp (p) + fasterexp (-p));
855 }
856 
857 static inline float
fasttanh(float p)858 fasttanh (float p)
859 {
860   return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
861 }
862 
863 static inline float
fastertanh(float p)864 fastertanh (float p)
865 {
866   return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p));
867 }
868 
869 #ifdef __SSE2__
870 
871 static inline v4sf
vfastsinh(const v4sf p)872 vfastsinh (const v4sf p)
873 {
874   const v4sf c_0_5 = v4sfl (0.5f);
875 
876   return c_0_5 * (vfastexp (p) - vfastexp (-p));
877 }
878 
879 static inline v4sf
vfastersinh(const v4sf p)880 vfastersinh (const v4sf p)
881 {
882   const v4sf c_0_5 = v4sfl (0.5f);
883 
884   return c_0_5 * (vfasterexp (p) - vfasterexp (-p));
885 }
886 
887 static inline v4sf
vfastcosh(const v4sf p)888 vfastcosh (const v4sf p)
889 {
890   const v4sf c_0_5 = v4sfl (0.5f);
891 
892   return c_0_5 * (vfastexp (p) + vfastexp (-p));
893 }
894 
895 static inline v4sf
vfastercosh(const v4sf p)896 vfastercosh (const v4sf p)
897 {
898   const v4sf c_0_5 = v4sfl (0.5f);
899 
900   return c_0_5 * (vfasterexp (p) + vfasterexp (-p));
901 }
902 
903 static inline v4sf
vfasttanh(const v4sf p)904 vfasttanh (const v4sf p)
905 {
906   const v4sf c_1 = v4sfl (1.0f);
907   const v4sf c_2 = v4sfl (2.0f);
908 
909   return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p));
910 }
911 
912 static inline v4sf
vfastertanh(const v4sf p)913 vfastertanh (const v4sf p)
914 {
915   const v4sf c_1 = v4sfl (1.0f);
916   const v4sf c_2 = v4sfl (2.0f);
917 
918   return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p));
919 }
920 
921 #endif //__SSE2__
922 
923 #endif // __FAST_HYPERBOLIC_H_
924 /*=====================================================================*
925  *                   Copyright (C) 2011 Paul Mineiro                   *
926  * All rights reserved.                                                *
927  *                                                                     *
928  * Redistribution and use in source and binary forms, with             *
929  * or without modification, are permitted provided that the            *
930  * following conditions are met:                                       *
931  *                                                                     *
932  *     * Redistributions of source code must retain the                *
933  *     above copyright notice, this list of conditions and             *
934  *     the following disclaimer.                                       *
935  *                                                                     *
936  *     * Redistributions in binary form must reproduce the             *
937  *     above copyright notice, this list of conditions and             *
938  *     the following disclaimer in the documentation and/or            *
939  *     other materials provided with the distribution.                 *
940  *                                                                     *
941  *     * Neither the name of Paul Mineiro nor the names                *
942  *     of other contributors may be used to endorse or promote         *
943  *     products derived from this software without specific            *
944  *     prior written permission.                                       *
945  *                                                                     *
946  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
947  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
948  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
949  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
950  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
951  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
952  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
953  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
954  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
955  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
956  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
957  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
958  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
959  * POSSIBILITY OF SUCH DAMAGE.                                         *
960  *                                                                     *
961  * Contact: Paul Mineiro <paul@mineiro.com>                            *
962  *=====================================================================*/
963 
964 #ifndef __FAST_LAMBERT_W_H_
965 #define __FAST_LAMBERT_W_H_
966 
967 #include <stdint.h>
968 
969 // these functions compute the upper branch aka W_0
970 
971 static inline float
fastlambertw(float x)972 fastlambertw (float x)
973 {
974   static const float threshold = 2.26445f;
975 
976   float c = (x < threshold) ? 1.546865557f : 1.0f;
977   float d = (x < threshold) ? 2.250366841f : 0.0f;
978   float a = (x < threshold) ? -0.737769969f : 0.0f;
979 
980   float logterm = fastlog (c * x + d);
981   float loglogterm = fastlog (logterm);
982 
983   float minusw = -a - logterm + loglogterm - loglogterm / logterm;
984   float expminusw = fastexp (minusw);
985   float xexpminusw = x * expminusw;
986   float pexpminusw = xexpminusw - minusw;
987 
988   return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) /
989          (2.0f + pexpminusw * (2.0f - minusw));
990 }
991 
992 static inline float
fasterlambertw(float x)993 fasterlambertw (float x)
994 {
995   static const float threshold = 2.26445f;
996 
997   float c = (x < threshold) ? 1.546865557f : 1.0f;
998   float d = (x < threshold) ? 2.250366841f : 0.0f;
999   float a = (x < threshold) ? -0.737769969f : 0.0f;
1000 
1001   float logterm = fasterlog (c * x + d);
1002   float loglogterm = fasterlog (logterm);
1003 
1004   float w = a + logterm - loglogterm + loglogterm / logterm;
1005   float expw = fasterexp (-w);
1006 
1007   return (w * w + expw * x) / (1.0f + w);
1008 }
1009 
1010 static inline float
fastlambertwexpx(float x)1011 fastlambertwexpx (float x)
1012 {
1013   static const float k = 1.1765631309f;
1014   static const float a = 0.94537622168f;
1015 
1016   float logarg = fmaxf (x, k);
1017   float powarg = (x < k) ? a * (x - k) : 0;
1018 
1019   float logterm = fastlog (logarg);
1020   float powterm = fasterpow2 (powarg);  // don't need accuracy here
1021 
1022   float w = powterm * (logarg - logterm + logterm / logarg);
1023   float logw = fastlog (w);
1024   float p = x - logw;
1025 
1026   return w * (2.0f + p + w * (3.0f + 2.0f * p)) /
1027          (2.0f - p + w * (5.0f + 2.0f * w));
1028 }
1029 
1030 static inline float
fasterlambertwexpx(float x)1031 fasterlambertwexpx (float x)
1032 {
1033   static const float k = 1.1765631309f;
1034   static const float a = 0.94537622168f;
1035 
1036   float logarg = fmaxf (x, k);
1037   float powarg = (x < k) ? a * (x - k) : 0;
1038 
1039   float logterm = fasterlog (logarg);
1040   float powterm = fasterpow2 (powarg);
1041 
1042   float w = powterm * (logarg - logterm + logterm / logarg);
1043   float logw = fasterlog (w);
1044 
1045   return w * (1.0f + x - logw) / (1.0f + w);
1046 }
1047 
1048 #ifdef __SSE2__
1049 
1050 static inline v4sf
vfastlambertw(v4sf x)1051 vfastlambertw (v4sf x)
1052 {
1053   const v4sf threshold = v4sfl (2.26445f);
1054 
1055   v4sf under = _mm_cmplt_ps (x, threshold);
1056   v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)),
1057                       _mm_andnot_ps (under, v4sfl (1.0f)));
1058   v4sf d = _mm_and_ps (under, v4sfl (2.250366841f));
1059   v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f));
1060 
1061   v4sf logterm = vfastlog (c * x + d);
1062   v4sf loglogterm = vfastlog (logterm);
1063 
1064   v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm;
1065   v4sf expminusw = vfastexp (minusw);
1066   v4sf xexpminusw = x * expminusw;
1067   v4sf pexpminusw = xexpminusw - minusw;
1068 
1069   return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) /
1070          (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw));
1071 }
1072 
1073 static inline v4sf
vfasterlambertw(v4sf x)1074 vfasterlambertw (v4sf x)
1075 {
1076   const v4sf threshold = v4sfl (2.26445f);
1077 
1078   v4sf under = _mm_cmplt_ps (x, threshold);
1079   v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)),
1080                       _mm_andnot_ps (under, v4sfl (1.0f)));
1081   v4sf d = _mm_and_ps (under, v4sfl (2.250366841f));
1082   v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f));
1083 
1084   v4sf logterm = vfasterlog (c * x + d);
1085   v4sf loglogterm = vfasterlog (logterm);
1086 
1087   v4sf w = a + logterm - loglogterm + loglogterm / logterm;
1088   v4sf expw = vfasterexp (-w);
1089 
1090   return (w * w + expw * x) / (v4sfl (1.0f) + w);
1091 }
1092 
1093 static inline v4sf
vfastlambertwexpx(v4sf x)1094 vfastlambertwexpx (v4sf x)
1095 {
1096   const v4sf k = v4sfl (1.1765631309f);
1097   const v4sf a = v4sfl (0.94537622168f);
1098   const v4sf two = v4sfl (2.0f);
1099   const v4sf three = v4sfl (3.0f);
1100   const v4sf five = v4sfl (5.0f);
1101 
1102   v4sf logarg = _mm_max_ps (x, k);
1103   v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k));
1104 
1105   v4sf logterm = vfastlog (logarg);
1106   v4sf powterm = vfasterpow2 (powarg);  // don't need accuracy here
1107 
1108   v4sf w = powterm * (logarg - logterm + logterm / logarg);
1109   v4sf logw = vfastlog (w);
1110   v4sf p = x - logw;
1111 
1112   return w * (two + p + w * (three + two * p)) /
1113          (two - p + w * (five + two * w));
1114 }
1115 
1116 static inline v4sf
vfasterlambertwexpx(v4sf x)1117 vfasterlambertwexpx (v4sf x)
1118 {
1119   const v4sf k = v4sfl (1.1765631309f);
1120   const v4sf a = v4sfl (0.94537622168f);
1121 
1122   v4sf logarg = _mm_max_ps (x, k);
1123   v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k));
1124 
1125   v4sf logterm = vfasterlog (logarg);
1126   v4sf powterm = vfasterpow2 (powarg);
1127 
1128   v4sf w = powterm * (logarg - logterm + logterm / logarg);
1129   v4sf logw = vfasterlog (w);
1130 
1131   return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w);
1132 }
1133 
1134 #endif // __SSE2__
1135 
1136 #endif // __FAST_LAMBERT_W_H_
1137 
1138 /*=====================================================================*
1139  *                   Copyright (C) 2011 Paul Mineiro                   *
1140  * All rights reserved.                                                *
1141  *                                                                     *
1142  * Redistribution and use in source and binary forms, with             *
1143  * or without modification, are permitted provided that the            *
1144  * following conditions are met:                                       *
1145  *                                                                     *
1146  *     * Redistributions of source code must retain the                *
1147  *     above copyright notice, this list of conditions and             *
1148  *     the following disclaimer.                                       *
1149  *                                                                     *
1150  *     * Redistributions in binary form must reproduce the             *
1151  *     above copyright notice, this list of conditions and             *
1152  *     the following disclaimer in the documentation and/or            *
1153  *     other materials provided with the distribution.                 *
1154  *                                                                     *
1155  *     * Neither the name of Paul Mineiro nor the names                *
1156  *     of other contributors may be used to endorse or promote         *
1157  *     products derived from this software without specific            *
1158  *     prior written permission.                                       *
1159  *                                                                     *
1160  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
1161  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
1162  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
1163  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
1164  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
1165  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
1166  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
1167  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
1168  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
1169  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
1170  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
1171  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
1172  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
1173  * POSSIBILITY OF SUCH DAMAGE.                                         *
1174  *                                                                     *
1175  * Contact: Paul Mineiro <paul@mineiro.com>                            *
1176  *=====================================================================*/
1177 
1178 #ifndef __FAST_POW_H_
1179 #define __FAST_POW_H_
1180 
1181 #include <stdint.h>
1182 
1183 static inline float
fastpow(float x,float p)1184 fastpow (float x,
1185          float p)
1186 {
1187   return fastpow2 (p * fastlog2 (x));
1188 }
1189 
1190 static inline float
fasterpow(float x,float p)1191 fasterpow (float x,
1192            float p)
1193 {
1194   return fasterpow2 (p * fasterlog2 (x));
1195 }
1196 
1197 #ifdef __SSE2__
1198 
1199 static inline v4sf
vfastpow(const v4sf x,const v4sf p)1200 vfastpow (const v4sf x,
1201           const v4sf p)
1202 {
1203   return vfastpow2 (p * vfastlog2 (x));
1204 }
1205 
1206 static inline v4sf
vfasterpow(const v4sf x,const v4sf p)1207 vfasterpow (const v4sf x,
1208             const v4sf p)
1209 {
1210   return vfasterpow2 (p * vfasterlog2 (x));
1211 }
1212 
1213 #endif //__SSE2__
1214 
1215 #endif // __FAST_POW_H_
1216 /*=====================================================================*
1217  *                   Copyright (C) 2011 Paul Mineiro                   *
1218  * All rights reserved.                                                *
1219  *                                                                     *
1220  * Redistribution and use in source and binary forms, with             *
1221  * or without modification, are permitted provided that the            *
1222  * following conditions are met:                                       *
1223  *                                                                     *
1224  *     * Redistributions of source code must retain the                *
1225  *     above copyright notice, this list of conditions and             *
1226  *     the following disclaimer.                                       *
1227  *                                                                     *
1228  *     * Redistributions in binary form must reproduce the             *
1229  *     above copyright notice, this list of conditions and             *
1230  *     the following disclaimer in the documentation and/or            *
1231  *     other materials provided with the distribution.                 *
1232  *                                                                     *
1233  *     * Neither the name of Paul Mineiro nor the names                *
1234  *     of other contributors may be used to endorse or promote         *
1235  *     products derived from this software without specific            *
1236  *     prior written permission.                                       *
1237  *                                                                     *
1238  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
1239  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
1240  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
1241  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
1242  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
1243  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
1244  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
1245  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
1246  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
1247  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
1248  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
1249  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
1250  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
1251  * POSSIBILITY OF SUCH DAMAGE.                                         *
1252  *                                                                     *
1253  * Contact: Paul Mineiro <paul@mineiro.com>                            *
1254  *=====================================================================*/
1255 
1256 #ifndef __FAST_SIGMOID_H_
1257 #define __FAST_SIGMOID_H_
1258 
1259 #include <stdint.h>
1260 
1261 static inline float
fastsigmoid(float x)1262 fastsigmoid (float x)
1263 {
1264   return 1.0f / (1.0f + fastexp (-x));
1265 }
1266 
1267 static inline float
fastersigmoid(float x)1268 fastersigmoid (float x)
1269 {
1270   return 1.0f / (1.0f + fasterexp (-x));
1271 }
1272 
1273 #ifdef __SSE2__
1274 
1275 static inline v4sf
vfastsigmoid(const v4sf x)1276 vfastsigmoid (const v4sf x)
1277 {
1278   const v4sf c_1 = v4sfl (1.0f);
1279 
1280   return c_1 / (c_1 + vfastexp (-x));
1281 }
1282 
1283 static inline v4sf
vfastersigmoid(const v4sf x)1284 vfastersigmoid (const v4sf x)
1285 {
1286   const v4sf c_1 = v4sfl (1.0f);
1287 
1288   return c_1 / (c_1 + vfasterexp (-x));
1289 }
1290 
1291 #endif //__SSE2__
1292 
1293 #endif // __FAST_SIGMOID_H_
1294 /*=====================================================================*
1295  *                   Copyright (C) 2011 Paul Mineiro                   *
1296  * All rights reserved.                                                *
1297  *                                                                     *
1298  * Redistribution and use in source and binary forms, with             *
1299  * or without modification, are permitted provided that the            *
1300  * following conditions are met:                                       *
1301  *                                                                     *
1302  *     * Redistributions of source code must retain the                *
1303  *     above copyright notice, this list of conditions and             *
1304  *     the following disclaimer.                                       *
1305  *                                                                     *
1306  *     * Redistributions in binary form must reproduce the             *
1307  *     above copyright notice, this list of conditions and             *
1308  *     the following disclaimer in the documentation and/or            *
1309  *     other materials provided with the distribution.                 *
1310  *                                                                     *
1311  *     * Neither the name of Paul Mineiro nor the names                *
1312  *     of other contributors may be used to endorse or promote         *
1313  *     products derived from this software without specific            *
1314  *     prior written permission.                                       *
1315  *                                                                     *
1316  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
1317  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
1318  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
1319  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
1320  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
1321  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
1322  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
1323  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
1324  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
1325  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
1326  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
1327  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
1328  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
1329  * POSSIBILITY OF SUCH DAMAGE.                                         *
1330  *                                                                     *
1331  * Contact: Paul Mineiro <paul@mineiro.com>                            *
1332  *=====================================================================*/
1333 
1334 #ifndef __FAST_TRIG_H_
1335 #define __FAST_TRIG_H_
1336 
1337 #include <stdint.h>
1338 
1339 // http://www.devmaster.net/forums/showthread.php?t=5784
1340 // fast sine variants are for x \in [ -\pi, pi ]
1341 // fast cosine variants are for x \in [ -\pi, pi ]
1342 // fast tangent variants are for x \in [ -\pi / 2, pi / 2 ]
1343 // "full" versions of functions handle the entire range of inputs
1344 // although the range reduction technique used here will be hopelessly
1345 // inaccurate for |x| >> 1000
1346 //
1347 // WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than
1348 // libc calls on older machines (!) and on newer machines are only
1349 // slighly faster.  however:
1350 //   * vectorized versions are competitive
1351 //   * faster full versions are competitive
1352 
1353 static inline float
fastsin(float x)1354 fastsin (float x)
1355 {
1356   static const float fouroverpi = 1.2732395447351627f;
1357   static const float fouroverpisq = 0.40528473456935109f;
1358   static const float q = 0.78444488374548933f;
1359   union { float f; uint32_t i; } p = { 0.20363937680730309f };
1360   union { float f; uint32_t i; } r = { 0.015124940802184233f };
1361   union { float f; uint32_t i; } s = { -0.0032225901625579573f };
1362 
1363   union { float f; uint32_t i; } vx = { x };
1364   uint32_t sign = vx.i & 0x80000000;
1365   vx.i = vx.i & 0x7FFFFFFF;
1366 
1367   float qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
1368   float qpproxsq = qpprox * qpprox;
1369 
1370   p.i |= sign;
1371   r.i |= sign;
1372   s.i ^= sign;
1373 
1374   return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f));
1375 }
1376 
1377 static inline float
fastersin(float x)1378 fastersin (float x)
1379 {
1380   static const float fouroverpi = 1.2732395447351627f;
1381   static const float fouroverpisq = 0.40528473456935109f;
1382   static const float q = 0.77633023248007499f;
1383   union { float f; uint32_t i; } p = { 0.22308510060189463f };
1384 
1385   union { float f; uint32_t i; } vx = { x };
1386   uint32_t sign = vx.i & 0x80000000;
1387   vx.i &= 0x7FFFFFFF;
1388 
1389   float qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
1390 
1391   p.i |= sign;
1392 
1393   return qpprox * (q + p.f * qpprox);
1394 }
1395 
1396 static inline float
fastsinfull(float x)1397 fastsinfull (float x)
1398 {
1399   static const float twopi = 6.2831853071795865f;
1400   static const float invtwopi = 0.15915494309189534f;
1401 
1402   int k = x * invtwopi;
1403   float half = (x < 0) ? -0.5f : 0.5f;
1404   return fastsin ((half + k) * twopi - x);
1405 }
1406 
1407 static inline float
fastersinfull(float x)1408 fastersinfull (float x)
1409 {
1410   static const float twopi = 6.2831853071795865f;
1411   static const float invtwopi = 0.15915494309189534f;
1412 
1413   int k = x * invtwopi;
1414   float half = (x < 0) ? -0.5f : 0.5f;
1415   return fastersin ((half + k) * twopi - x);
1416 }
1417 
1418 static inline float
fastcos(float x)1419 fastcos (float x)
1420 {
1421   static const float halfpi = 1.5707963267948966f;
1422   static const float halfpiminustwopi = -4.7123889803846899f;
1423   float offset = (x > halfpi) ? halfpiminustwopi : halfpi;
1424   return fastsin (x + offset);
1425 }
1426 
1427 static inline float
fastercos(float x)1428 fastercos (float x)
1429 {
1430   static const float twooverpi = 0.63661977236758134f;
1431   static const float p = 0.54641335845679634f;
1432 
1433   union { float f; uint32_t i; } vx = { x };
1434   vx.i &= 0x7FFFFFFF;
1435 
1436   float qpprox = 1.0f - twooverpi * vx.f;
1437 
1438   return qpprox + p * qpprox * (1.0f - qpprox * qpprox);
1439 }
1440 
1441 static inline float
fastcosfull(float x)1442 fastcosfull (float x)
1443 {
1444   static const float halfpi = 1.5707963267948966f;
1445   return fastsinfull (x + halfpi);
1446 }
1447 
1448 static inline float
fastercosfull(float x)1449 fastercosfull (float x)
1450 {
1451   static const float halfpi = 1.5707963267948966f;
1452   return fastersinfull (x + halfpi);
1453 }
1454 
1455 static inline float
fasttan(float x)1456 fasttan (float x)
1457 {
1458   static const float halfpi = 1.5707963267948966f;
1459   return fastsin (x) / fastsin (x + halfpi);
1460 }
1461 
1462 static inline float
fastertan(float x)1463 fastertan (float x)
1464 {
1465   return fastersin (x) / fastercos (x);
1466 }
1467 
1468 static inline float
fasttanfull(float x)1469 fasttanfull (float x)
1470 {
1471   static const float twopi = 6.2831853071795865f;
1472   static const float invtwopi = 0.15915494309189534f;
1473 
1474   int k = x * invtwopi;
1475   float half = (x < 0) ? -0.5f : 0.5f;
1476   float xnew = x - (half + k) * twopi;
1477 
1478   return fastsin (xnew) / fastcos (xnew);
1479 }
1480 
1481 static inline float
fastertanfull(float x)1482 fastertanfull (float x)
1483 {
1484   static const float twopi = 6.2831853071795865f;
1485   static const float invtwopi = 0.15915494309189534f;
1486 
1487   int k = x * invtwopi;
1488   float half = (x < 0) ? -0.5f : 0.5f;
1489   float xnew = x - (half + k) * twopi;
1490 
1491   return fastersin (xnew) / fastercos (xnew);
1492 }
1493 
1494 #ifdef __SSE2__
1495 
1496 static inline v4sf
vfastsin(const v4sf x)1497 vfastsin (const v4sf x)
1498 {
1499   const v4sf fouroverpi = v4sfl (1.2732395447351627f);
1500   const v4sf fouroverpisq = v4sfl (0.40528473456935109f);
1501   const v4sf q = v4sfl (0.78444488374548933f);
1502   const v4sf p = v4sfl (0.20363937680730309f);
1503   const v4sf r = v4sfl (0.015124940802184233f);
1504   const v4sf s = v4sfl (-0.0032225901625579573f);
1505 
1506   union { v4sf f; v4si i; } vx = { x };
1507   v4si sign = vx.i & v4sil (0x80000000);
1508   vx.i &= v4sil (0x7FFFFFFF);
1509 
1510   v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
1511   v4sf qpproxsq = qpprox * qpprox;
1512   union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s));
1513   vy.i ^= sign;
1514 
1515   return q * qpprox + vy.f;
1516 }
1517 
1518 static inline v4sf
vfastersin(const v4sf x)1519 vfastersin (const v4sf x)
1520 {
1521   const v4sf fouroverpi = v4sfl (1.2732395447351627f);
1522   const v4sf fouroverpisq = v4sfl (0.40528473456935109f);
1523   const v4sf q = v4sfl (0.77633023248007499f);
1524   const v4sf plit = v4sfl (0.22308510060189463f);
1525   union { v4sf f; v4si i; } p = { plit };
1526 
1527   union { v4sf f; v4si i; } vx = { x };
1528   v4si sign = vx.i & v4sil (0x80000000);
1529   vx.i &= v4sil (0x7FFFFFFF);
1530 
1531   v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
1532 
1533   p.i |= sign;
1534 
1535   return qpprox * (q + p.f * qpprox);
1536 }
1537 
1538 static inline v4sf
vfastsinfull(const v4sf x)1539 vfastsinfull (const v4sf x)
1540 {
1541   const v4sf twopi = v4sfl (6.2831853071795865f);
1542   const v4sf invtwopi = v4sfl (0.15915494309189534f);
1543 
1544   v4si k = v4sf_to_v4si (x * invtwopi);
1545 
1546   v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
1547   v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
1548                          _mm_andnot_ps (ltzero, v4sfl (0.5f)));
1549 
1550   return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x);
1551 }
1552 
1553 static inline v4sf
vfastersinfull(const v4sf x)1554 vfastersinfull (const v4sf x)
1555 {
1556   const v4sf twopi = v4sfl (6.2831853071795865f);
1557   const v4sf invtwopi = v4sfl (0.15915494309189534f);
1558 
1559   v4si k = v4sf_to_v4si (x * invtwopi);
1560 
1561   v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
1562   v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
1563                          _mm_andnot_ps (ltzero, v4sfl (0.5f)));
1564 
1565   return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x);
1566 }
1567 
1568 static inline v4sf
vfastcos(const v4sf x)1569 vfastcos (const v4sf x)
1570 {
1571   const v4sf halfpi = v4sfl (1.5707963267948966f);
1572   const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f);
1573   v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi);
1574   v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi),
1575                            _mm_andnot_ps (lthalfpi, halfpi));
1576   return vfastsin (x + offset);
1577 }
1578 
1579 static inline v4sf
vfastercos(v4sf x)1580 vfastercos (v4sf x)
1581 {
1582   const v4sf twooverpi = v4sfl (0.63661977236758134f);
1583   const v4sf p = v4sfl (0.54641335845679634);
1584 
1585   v4sf vx = v4sf_fabs (x);
1586   v4sf qpprox = v4sfl (1.0f) - twooverpi * vx;
1587 
1588   return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox);
1589 }
1590 
1591 static inline v4sf
vfastcosfull(const v4sf x)1592 vfastcosfull (const v4sf x)
1593 {
1594   const v4sf halfpi = v4sfl (1.5707963267948966f);
1595   return vfastsinfull (x + halfpi);
1596 }
1597 
1598 static inline v4sf
vfastercosfull(const v4sf x)1599 vfastercosfull (const v4sf x)
1600 {
1601   const v4sf halfpi = v4sfl (1.5707963267948966f);
1602   return vfastersinfull (x + halfpi);
1603 }
1604 
1605 static inline v4sf
vfasttan(const v4sf x)1606 vfasttan (const v4sf x)
1607 {
1608   const v4sf halfpi = v4sfl (1.5707963267948966f);
1609   return vfastsin (x) / vfastsin (x + halfpi);
1610 }
1611 
1612 static inline v4sf
vfastertan(const v4sf x)1613 vfastertan (const v4sf x)
1614 {
1615   return vfastersin (x) / vfastercos (x);
1616 }
1617 
1618 static inline v4sf
vfasttanfull(const v4sf x)1619 vfasttanfull (const v4sf x)
1620 {
1621   const v4sf twopi = v4sfl (6.2831853071795865f);
1622   const v4sf invtwopi = v4sfl (0.15915494309189534f);
1623 
1624   v4si k = v4sf_to_v4si (x * invtwopi);
1625 
1626   v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
1627   v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
1628                          _mm_andnot_ps (ltzero, v4sfl (0.5f)));
1629   v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi;
1630 
1631   return vfastsin (xnew) / vfastcos (xnew);
1632 }
1633 
1634 static inline v4sf
vfastertanfull(const v4sf x)1635 vfastertanfull (const v4sf x)
1636 {
1637   const v4sf twopi = v4sfl (6.2831853071795865f);
1638   const v4sf invtwopi = v4sfl (0.15915494309189534f);
1639 
1640   v4si k = v4sf_to_v4si (x * invtwopi);
1641 
1642   v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
1643   v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
1644                          _mm_andnot_ps (ltzero, v4sfl (0.5f)));
1645   v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi;
1646 
1647   return vfastersin (xnew) / vfastercos (xnew);
1648 }
1649 
1650 #endif //__SSE2__
1651 
1652 #endif // __FAST_TRIG_H_
1653