/*=====================================================================*
 *                   Copyright (C) 2012 Paul Mineiro                   *
 * All rights reserved.                                                *
 *                                                                     *
 * Redistribution and use in source and binary forms, with             *
 * or without modification, are permitted provided that the            *
 * following conditions are met:                                       *
 *                                                                     *
 *     * Redistributions of source code must retain the                *
 *     above copyright notice, this list of conditions and             *
 *     the following disclaimer.                                       *
 *                                                                     *
 *     * Redistributions in binary form must reproduce the             *
 *     above copyright notice, this list of conditions and             *
 *     the following disclaimer in the documentation and/or            *
 *     other materials provided with the distribution.                 *
 *                                                                     *
 *     * Neither the name of Paul Mineiro nor the names                *
 *     of other contributors may be used to endorse or promote         *
 *     products derived from this software without specific            *
 *     prior written permission.                                       *
 *                                                                     *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 * POSSIBILITY OF SUCH DAMAGE.                                         *
 *                                                                     *
 * Contact: Paul Mineiro <paul@mineiro.com>                            *
 *=====================================================================*/

#ifndef __CAST_H_
#define __CAST_H_

#ifdef __cplusplus
#define cast_uint32_t static_cast<uint32_t>
#else
#define cast_uint32_t (uint32_t)
#endif

#endif // __CAST_H_
/*=====================================================================*
 *                   Copyright (C) 2011 Paul Mineiro                   *
 * All rights reserved.                                                *
 *                                                                     *
 * Redistribution and use in source and binary forms, with             *
 * or without modification, are permitted provided that the            *
 * following conditions are met:                                       *
 *                                                                     *
 *     * Redistributions of source code must retain the                *
 *     above copyright notice, this list of conditions and             *
 *     the following disclaimer.                                       *
 *                                                                     *
 *     * Redistributions in binary form must reproduce the             *
 *     above copyright notice, this list of conditions and             *
 *     the following disclaimer in the documentation and/or            *
 *     other materials provided with the distribution.                 *
 *                                                                     *
 *     * Neither the name of Paul Mineiro nor the names                *
 *     of other contributors may be used to endorse or promote         *
 *     products derived from this software without specific            *
 *     prior written permission.                                       *
 *                                                                     *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
 * POSSIBILITY OF SUCH DAMAGE.                                         *
 *                                                                     *
 * Contact: Paul Mineiro <paul@mineiro.com>                            *
 *=====================================================================*/

#ifndef __SSE_H_
#define __SSE_H_

#ifdef __SSE2__

#include <emmintrin.h>

#ifdef __cplusplus
namespace {
#endif // __cplusplus

typedef __m128 v4sf;
typedef __m128i v4si;

#define v4si_to_v4sf _mm_cvtepi32_ps
#define v4sf_to_v4si _mm_cvttps_epi32

#if _MSC_VER && !__INTEL_COMPILER
template <class T>
__forceinline char GetChar(T value, size_t index) { return ((char*)&value)[index]; }

#define AS_4CHARS(a) \
    GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \
    GetChar(int32_t(a), 2), GetChar(int32_t(a), 3)

#define _MM_SETR_EPI32(a0, a1, a2, a3) \
    { AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) }

#define v4sfl(x) (const v4sf { (x), (x), (x), (x) })
#define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x))

__forceinline const v4sf operator+(const v4sf& a, const v4sf& b) { return _mm_add_ps(a,b); }
__forceinline const v4sf operator-(const v4sf& a, const v4sf& b) { return _mm_sub_ps(a,b); }
__forceinline const v4sf operator/(const v4sf& a, const v4sf& b) { return _mm_div_ps(a,b); }
__forceinline const v4sf operator*(const v4sf& a, const v4sf& b) { return _mm_mul_ps(a,b); }

__forceinline const v4sf operator+(const v4sf& a) { return a; }
__forceinline const v4sf operator-(const v4sf& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }

__forceinline const v4sf operator&(const v4sf& a, const v4sf& b) { return _mm_and_ps(a,b); }
__forceinline const v4sf operator|(const v4sf& a, const v4sf& b) { return _mm_or_ps(a,b); }
__forceinline const v4sf operator^(const v4sf& a, const v4sf& b) { return _mm_xor_ps(a,b); }

__forceinline const v4si operator&(const v4si& a, const v4si& b) { return _mm_and_si128(a,b); }
__forceinline const v4si operator|(const v4si& a, const v4si& b) { return _mm_or_si128(a,b); }
__forceinline const v4si operator^(const v4si& a, const v4si& b) { return _mm_xor_si128(a,b); }

__forceinline const v4sf operator+=(v4sf& a, const v4sf& b) { return a = a + b; }
__forceinline const v4sf operator-=(v4sf& a, const v4sf& b) { return a = a - b; }
__forceinline const v4sf operator*=(v4sf& a, const v4sf& b) { return a = a * b; }
__forceinline const v4sf operator/=(v4sf& a, const v4sf& b) { return a = a / b; }

__forceinline const v4si operator|=(v4si& a, const v4si& b) { return a = a | b; }
__forceinline const v4si operator&=(v4si& a, const v4si& b) { return a = a & b; }
__forceinline const v4si operator^=(v4si& a, const v4si& b) { return a = a ^ b; }
#else
#define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
#define v2dil(x) ((const v4si) { (x), (x) })
#define v4sil(x) v2dil((((long long) (x)) << 32) | (long long) (x))
#endif

typedef union { v4sf f; float array[4]; } v4sfindexer;
#define v4sf_index(_findx, _findi)      \
  ({                                    \
    v4sfindexer _findvx = { _findx } ;  \
    _findvx.array[_findi];              \
  })
typedef union { v4si i; int array[4]; } v4siindexer;
#define v4si_index(_iindx, _iindi)      \
  ({                                    \
    v4siindexer _iindvx = { _iindx } ;  \
    _iindvx.array[_iindi];              \
  })
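
/* Usage sketch (editor's note, not part of the original API): the indexer
 * unions above let scalar code read one lane of a vector result, e.g.
 *
 *   v4sf v = v4sfl (2.0f);
 *   float lane0 = v4sf_index (v, 0);   // 2.0f
 *
 * Note that v4sf_index / v4si_index use a GCC statement expression, so
 * they require a compiler supporting that extension. */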

typedef union { v4sf f; v4si i; } v4sfv4sipun;
#if _MSC_VER && !__INTEL_COMPILER
#define v4sf_fabs(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
#else
#define v4sf_fabs(x)              \
  ({                              \
    v4sfv4sipun vx;               \
    vx.f = x;                     \
    vx.i &= v4sil (0x7FFFFFFF);   \
    vx.f;                         \
  })
#endif

#ifdef __cplusplus
} // end namespace
#endif // __cplusplus

#endif // __SSE2__

#endif // __SSE_H_

#ifndef __FAST_EXP_H_
#define __FAST_EXP_H_

#include <stdint.h>

// Underflow of the exponential is common practice in numerical routines,
// so handle it here: arguments below -126 are clamped to -126, which
// keeps the bit trick below inside the normalized float range.

static inline float
fastpow2 (float p)
{
  float offset = (p < 0) ? 1.0f : 0.0f;
  float clipp = (p < -126) ? -126.0f : p;
  int w = clipp;
  float z = clipp - w + offset;
  union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) };

  return v.f;
}
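
// How the bit trick above works (editor's note): split p = w + z with
// integer w and z in [0, 1).  Multiplying by 1 << 23 and storing the
// result into the integer representation of a float places roughly
// w + 127 into the exponent field -- i.e. it constructs 2^w directly --
// while the rational function of z fills in the mantissa so the result
// approximates 2^z.  For example, fastpow2 (3.0f) returns approximately
// 8.0f.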

static inline float
fastexp (float p)
{
  return fastpow2 (1.442695040f * p);
}

static inline float
fasterpow2 (float p)
{
  float clipp = (p < -126) ? -126.0f : p;
  union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) };
  return v.f;
}

static inline float
fasterexp (float p)
{
  return fasterpow2 (1.442695040f * p);
}
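
// Editor's note: fasterpow2 is a Schraudolph-style approximation: the
// float is built from the single term clipp + 126.94269504f, so the
// fractional part of clipp spills into the mantissa, giving a
// piecewise-linear approximation of 2^p; the small offset below 127
// balances the approximation error across each octave.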

#ifdef __SSE2__

static inline v4sf
vfastpow2 (const v4sf p)
{
  v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f));
  v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f));
  v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
  v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
  v4si w = v4sf_to_v4si (clipp);
  v4sf z = clipp - v4si_to_v4sf (w) + offset;

  const v4sf c_121_2740575 = v4sfl (121.2740575f);
  const v4sf c_27_7280233 = v4sfl (27.7280233f);
  const v4sf c_4_84252568 = v4sfl (4.84252568f);
  const v4sf c_1_49012907 = v4sfl (1.49012907f);
  union { v4si i; v4sf f; } v = {
    v4sf_to_v4si (
      v4sfl (1 << 23) *
      (clipp + c_121_2740575 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z)
    )
  };

  return v.f;
}

static inline v4sf
vfastexp (const v4sf p)
{
  const v4sf c_invlog_2 = v4sfl (1.442695040f);

  return vfastpow2 (c_invlog_2 * p);
}

static inline v4sf
vfasterpow2 (const v4sf p)
{
  const v4sf c_126_94269504 = v4sfl (126.94269504f);
  v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f));
  v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f)));
  union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) };
  return v.f;
}

static inline v4sf
vfasterexp (const v4sf p)
{
  const v4sf c_invlog_2 = v4sfl (1.442695040f);

  return vfasterpow2 (c_invlog_2 * p);
}
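
// Usage sketch (editor's note, not part of the original API): apply the
// vectorized exponential to an array four floats at a time.  Assumes n
// is a multiple of 4 and both pointers are 16-byte aligned.
static inline void
vfastexp_array (const float *in, float *out, int n)
{
  for (int i = 0; i < n; i += 4)
    _mm_store_ps (out + i, vfastexp (_mm_load_ps (in + i)));
}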

#endif //__SSE2__

#endif // __FAST_EXP_H_

#ifndef __FAST_LOG_H_
#define __FAST_LOG_H_

#include <stdint.h>

static inline float
fastlog2 (float x)
{
  union { float f; uint32_t i; } vx = { x };
  union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
  float y = vx.i;
  y *= 1.1920928955078125e-7f;

  return y - 124.22551499f
           - 1.498030302f * mx.f
           - 1.72587999f / (0.3520887068f + mx.f);
}
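
// How the code above works (editor's note): reading the float's bits as
// an integer and scaling by 2^-23 gives approximately log2(x) + 127, up
// to a mantissa-dependent error; mx remaps the mantissa into [0.5, 1.0)
// by forcing the exponent bits to 0x3f000000, and the rational function
// of mx.f cancels most of that error.  For example, fastlog2 (8.0f) is
// approximately 3.0f.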

static inline float
fastlog (float x)
{
  return 0.69314718f * fastlog2 (x);
}

static inline float
fasterlog2 (float x)
{
  union { float f; uint32_t i; } vx = { x };
  float y = vx.i;
  y *= 1.1920928955078125e-7f;
  return y - 126.94269504f;
}

static inline float
fasterlog (float x)
{
//  return 0.69314718f * fasterlog2 (x);

  union { float f; uint32_t i; } vx = { x };
  float y = vx.i;
  y *= 8.2629582881927490e-8f;
  return y - 87.989971088f;
}
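
// Editor's note: fasterlog fuses the ln(2) scaling into the bit trick:
// 8.2629582881927490e-8f is ln(2) / 2^23, and 87.989971088f is
// 126.94269504f * ln(2), i.e. the fasterlog2 bias pre-multiplied by
// ln(2).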

#ifdef __SSE2__

static inline v4sf
vfastlog2 (v4sf x)
{
  union { v4sf f; v4si i; } vx = { x };
  union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000);
  v4sf y = v4si_to_v4sf (vx.i);
  y *= v4sfl (1.1920928955078125e-7f);

  const v4sf c_124_22551499 = v4sfl (124.22551499f);
  const v4sf c_1_498030302 = v4sfl (1.498030302f);
  const v4sf c_1_72587999 = v4sfl (1.72587999f);
  const v4sf c_0_3520887068 = v4sfl (0.3520887068f);

  return y - c_124_22551499
           - c_1_498030302 * mx.f
           - c_1_72587999 / (c_0_3520887068 + mx.f);
}

static inline v4sf
vfastlog (v4sf x)
{
  const v4sf c_0_69314718 = v4sfl (0.69314718f);

  return c_0_69314718 * vfastlog2 (x);
}

static inline v4sf
vfasterlog2 (v4sf x)
{
  union { v4sf f; v4si i; } vx = { x };
  v4sf y = v4si_to_v4sf (vx.i);
  y *= v4sfl (1.1920928955078125e-7f);

  const v4sf c_126_94269504 = v4sfl (126.94269504f);

  return y - c_126_94269504;
}

static inline v4sf
vfasterlog (v4sf x)
{
//  const v4sf c_0_69314718 = v4sfl (0.69314718f);
//
//  return c_0_69314718 * vfasterlog2 (x);

  union { v4sf f; v4si i; } vx = { x };
  v4sf y = v4si_to_v4sf (vx.i);
  y *= v4sfl (8.2629582881927490e-8f);

  const v4sf c_87_989971088 = v4sfl (87.989971088f);

  return y - c_87_989971088;
}

#endif // __SSE2__

#endif // __FAST_LOG_H_

#ifndef __FAST_ERF_H_
#define __FAST_ERF_H_

#include <math.h>
#include <stdint.h>

// fasterfc: not actually faster than erfcf(3) on newer machines!
// ... although the vectorized version is interesting
// and fastererfc is very fast

static inline float
fasterfc (float x)
{
  static const float k = 3.3509633149424609f;
  static const float a = 0.07219054755431126f;
  static const float b = 15.418191568719577f;
  static const float c = 5.609846028328545f;

  union { float f; uint32_t i; } vc = { c * x };
  float xsq = x * x;
  float xquad = xsq * xsq;

  vc.i |= 0x80000000;

  return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f);
}

static inline float
fastererfc (float x)
{
  static const float k = 3.3509633149424609f;

  return 2.0f / (1.0f + fasterpow2 (k * x));
}

// fasterf: not actually faster than erff(3) on newer machines!
// ... although the vectorized version is interesting
// and fastererf is very fast

static inline float
fasterf (float x)
{
  return 1.0f - fasterfc (x);
}

static inline float
fastererf (float x)
{
  return 1.0f - fastererfc (x);
}

static inline float
fastinverseerf (float x)
{
  static const float invk = 0.30004578719350504f;
  static const float a = 0.020287853348211326f;
  static const float b = 0.07236892874789555f;
  static const float c = 0.9913030456864257f;
  static const float d = 0.8059775923760193f;

  float xsq = x * x;

  return invk * fastlog2 ((1.0f + x) / (1.0f - x))
       + x * (a - b * xsq) / (c - d * xsq);
}

static inline float
fasterinverseerf (float x)
{
  static const float invk = 0.30004578719350504f;

  return invk * fasterlog2 ((1.0f + x) / (1.0f - x));
}
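
// Usage sketch (editor's note; fastnorminv is a name invented here, not
// part of the original API): a cheap inverse normal CDF via the identity
// ndtri(u) = sqrt(2) * erfinv(2u - 1), for u in (0, 1).
static inline float
fastnorminv (float u)
{
  return 1.4142135624f * fastinverseerf (2.0f * u - 1.0f);
}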

#ifdef __SSE2__

static inline v4sf
vfasterfc (v4sf x)
{
  const v4sf k = v4sfl (3.3509633149424609f);
  const v4sf a = v4sfl (0.07219054755431126f);
  const v4sf b = v4sfl (15.418191568719577f);
  const v4sf c = v4sfl (5.609846028328545f);

  union { v4sf f; v4si i; } vc; vc.f = c * x;
  vc.i |= v4sil (0x80000000);

  v4sf xsq = x * x;
  v4sf xquad = xsq * xsq;

  return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f);
}

static inline v4sf
vfastererfc (const v4sf x)
{
  const v4sf k = v4sfl (3.3509633149424609f);

  return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x));
}

static inline v4sf
vfasterf (v4sf x)
{
  return v4sfl (1.0f) - vfasterfc (x);
}

static inline v4sf
vfastererf (const v4sf x)
{
  return v4sfl (1.0f) - vfastererfc (x);
}

static inline v4sf
vfastinverseerf (v4sf x)
{
  const v4sf invk = v4sfl (0.30004578719350504f);
  const v4sf a = v4sfl (0.020287853348211326f);
  const v4sf b = v4sfl (0.07236892874789555f);
  const v4sf c = v4sfl (0.9913030456864257f);
  const v4sf d = v4sfl (0.8059775923760193f);

  v4sf xsq = x * x;

  return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x))
       + x * (a - b * xsq) / (c - d * xsq);
}

static inline v4sf
vfasterinverseerf (v4sf x)
{
  const v4sf invk = v4sfl (0.30004578719350504f);

  return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x));
}

#endif //__SSE2__

#endif // __FAST_ERF_H_

#ifndef __FAST_GAMMA_H_
#define __FAST_GAMMA_H_

#include <stdint.h>

/* gamma/digamma functions only work for positive inputs */

static inline float
fastlgamma (float x)
{
  float logterm = fastlog (x * (1.0f + x) * (2.0f + x));
  float xp3 = 3.0f + x;

  return - 2.081061466f
         - x
         + 0.0833333f / xp3
         - logterm
         + (2.5f + x) * fastlog (xp3);
}
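
// Editor's note: this is Stirling's series applied at x + 3, i.e.
// lgamma(z) ~ (z - 0.5) log(z) - z + 0.5 log(2 pi) + 1/(12 z) with
// z = x + 3 (note -2.081061466 = 0.5 log(2 pi) - 3), shifted back via
// lgamma(x) = lgamma(x + 3) - log(x (1 + x) (2 + x)), which is exactly
// the logterm subtracted above.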

static inline float
fasterlgamma (float x)
{
  return - 0.0810614667f
         - x
         - fasterlog (x)
         + (0.5f + x) * fasterlog (1.0f + x);
}

static inline float
fastdigamma (float x)
{
  float twopx = 2.0f + x;
  float logterm = fastlog (twopx);

  return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) /
         (12.0f * x * (1.0f + x) * twopx * twopx)
       + logterm;
}

static inline float
fasterdigamma (float x)
{
  float onepx = 1.0f + x;

  return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx);
}

#ifdef __SSE2__

static inline v4sf
vfastlgamma (v4sf x)
{
  const v4sf c_1_0 = v4sfl (1.0f);
  const v4sf c_2_0 = v4sfl (2.0f);
  const v4sf c_3_0 = v4sfl (3.0f);
  const v4sf c_2_081061466 = v4sfl (2.081061466f);
  const v4sf c_0_0833333 = v4sfl (0.0833333f);
  const v4sf c_2_5 = v4sfl (2.5f);

  v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x));
  v4sf xp3 = c_3_0 + x;

  return - c_2_081061466
         - x
         + c_0_0833333 / xp3
         - logterm
         + (c_2_5 + x) * vfastlog (xp3);
}

static inline v4sf
vfasterlgamma (v4sf x)
{
  const v4sf c_0_0810614667 = v4sfl (0.0810614667f);
  const v4sf c_0_5 = v4sfl (0.5f);
  const v4sf c_1 = v4sfl (1.0f);

  return - c_0_0810614667
         - x
         - vfasterlog (x)
         + (c_0_5 + x) * vfasterlog (c_1 + x);
}

static inline v4sf
vfastdigamma (v4sf x)
{
  v4sf twopx = v4sfl (2.0f) + x;
  v4sf logterm = vfastlog (twopx);

  return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) /
         (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx)
       + logterm;
}

static inline v4sf
vfasterdigamma (v4sf x)
{
  const v4sf c_1_0 = v4sfl (1.0f);
  const v4sf c_2_0 = v4sfl (2.0f);
  v4sf onepx = c_1_0 + x;

  return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx);
}

#endif //__SSE2__

#endif // __FAST_GAMMA_H_

#ifndef __FAST_HYPERBOLIC_H_
#define __FAST_HYPERBOLIC_H_

#include <stdint.h>

static inline float
fastsinh (float p)
{
  return 0.5f * (fastexp (p) - fastexp (-p));
}

static inline float
fastersinh (float p)
{
  return 0.5f * (fasterexp (p) - fasterexp (-p));
}

static inline float
fastcosh (float p)
{
  return 0.5f * (fastexp (p) + fastexp (-p));
}

static inline float
fastercosh (float p)
{
  return 0.5f * (fasterexp (p) + fasterexp (-p));
}

static inline float
fasttanh (float p)
{
  return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
}

static inline float
fastertanh (float p)
{
  return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p));
}

#ifdef __SSE2__

static inline v4sf
vfastsinh (const v4sf p)
{
  const v4sf c_0_5 = v4sfl (0.5f);

  return c_0_5 * (vfastexp (p) - vfastexp (-p));
}

static inline v4sf
vfastersinh (const v4sf p)
{
  const v4sf c_0_5 = v4sfl (0.5f);

  return c_0_5 * (vfasterexp (p) - vfasterexp (-p));
}

static inline v4sf
vfastcosh (const v4sf p)
{
  const v4sf c_0_5 = v4sfl (0.5f);

  return c_0_5 * (vfastexp (p) + vfastexp (-p));
}

static inline v4sf
vfastercosh (const v4sf p)
{
  const v4sf c_0_5 = v4sfl (0.5f);

  return c_0_5 * (vfasterexp (p) + vfasterexp (-p));
}

static inline v4sf
vfasttanh (const v4sf p)
{
  const v4sf c_1 = v4sfl (1.0f);
  const v4sf c_2 = v4sfl (2.0f);

  return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p));
}

static inline v4sf
vfastertanh (const v4sf p)
{
  const v4sf c_1 = v4sfl (1.0f);
  const v4sf c_2 = v4sfl (2.0f);

  return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p));
}

#endif //__SSE2__

#endif // __FAST_HYPERBOLIC_H_

#ifndef __FAST_LAMBERT_W_H_
#define __FAST_LAMBERT_W_H_

#include <stdint.h>

// these functions compute the upper branch, a.k.a. W_0

static inline float
fastlambertw (float x)
{
  static const float threshold = 2.26445f;

  float c = (x < threshold) ? 1.546865557f : 1.0f;
  float d = (x < threshold) ? 2.250366841f : 0.0f;
  float a = (x < threshold) ? -0.737769969f : 0.0f;

  float logterm = fastlog (c * x + d);
  float loglogterm = fastlog (logterm);

  float minusw = -a - logterm + loglogterm - loglogterm / logterm;
  float expminusw = fastexp (minusw);
  float xexpminusw = x * expminusw;
  float pexpminusw = xexpminusw - minusw;

  return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) /
         (2.0f + pexpminusw * (2.0f - minusw));
}
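
// Editor's note on the code above: the branch-free selects choose series
// constants for small x versus the asymptotic regime, the log / log-log
// combination is the standard asymptotic initial guess for W_0, and the
// final expression applies one rational refinement step toward the root
// of w e^w = x.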

static inline float
fasterlambertw (float x)
{
  static const float threshold = 2.26445f;

  float c = (x < threshold) ? 1.546865557f : 1.0f;
  float d = (x < threshold) ? 2.250366841f : 0.0f;
  float a = (x < threshold) ? -0.737769969f : 0.0f;

  float logterm = fasterlog (c * x + d);
  float loglogterm = fasterlog (logterm);

  float w = a + logterm - loglogterm + loglogterm / logterm;
  float expw = fasterexp (-w);

  return (w * w + expw * x) / (1.0f + w);
}

static inline float
fastlambertwexpx (float x)
{
  static const float k = 1.1765631309f;
  static const float a = 0.94537622168f;

  float logarg = fmaxf (x, k);
  float powarg = (x < k) ? a * (x - k) : 0;

  float logterm = fastlog (logarg);
  float powterm = fasterpow2 (powarg); // don't need accuracy here

  float w = powterm * (logarg - logterm + logterm / logarg);
  float logw = fastlog (w);
  float p = x - logw;

  return w * (2.0f + p + w * (3.0f + 2.0f * p)) /
         (2.0f - p + w * (5.0f + 2.0f * w));
}

static inline float
fasterlambertwexpx (float x)
{
  static const float k = 1.1765631309f;
  static const float a = 0.94537622168f;

  float logarg = fmaxf (x, k);
  float powarg = (x < k) ? a * (x - k) : 0;

  float logterm = fasterlog (logarg);
  float powterm = fasterpow2 (powarg);

  float w = powterm * (logarg - logterm + logterm / logarg);
  float logw = fasterlog (w);

  return w * (1.0f + x - logw) / (1.0f + w);
}

#ifdef __SSE2__

static inline v4sf
vfastlambertw (v4sf x)
{
  const v4sf threshold = v4sfl (2.26445f);

  v4sf under = _mm_cmplt_ps (x, threshold);
  v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)),
                      _mm_andnot_ps (under, v4sfl (1.0f)));
  v4sf d = _mm_and_ps (under, v4sfl (2.250366841f));
  v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f));

  v4sf logterm = vfastlog (c * x + d);
  v4sf loglogterm = vfastlog (logterm);

  v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm;
  v4sf expminusw = vfastexp (minusw);
  v4sf xexpminusw = x * expminusw;
  v4sf pexpminusw = xexpminusw - minusw;

  return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) /
         (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw));
}

static inline v4sf
vfasterlambertw (v4sf x)
{
  const v4sf threshold = v4sfl (2.26445f);

  v4sf under = _mm_cmplt_ps (x, threshold);
  v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)),
                      _mm_andnot_ps (under, v4sfl (1.0f)));
  v4sf d = _mm_and_ps (under, v4sfl (2.250366841f));
  v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f));

  v4sf logterm = vfasterlog (c * x + d);
  v4sf loglogterm = vfasterlog (logterm);

  v4sf w = a + logterm - loglogterm + loglogterm / logterm;
  v4sf expw = vfasterexp (-w);

  return (w * w + expw * x) / (v4sfl (1.0f) + w);
}

static inline v4sf
vfastlambertwexpx (v4sf x)
{
  const v4sf k = v4sfl (1.1765631309f);
  const v4sf a = v4sfl (0.94537622168f);
  const v4sf two = v4sfl (2.0f);
  const v4sf three = v4sfl (3.0f);
  const v4sf five = v4sfl (5.0f);

  v4sf logarg = _mm_max_ps (x, k);
  v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k));

  v4sf logterm = vfastlog (logarg);
  v4sf powterm = vfasterpow2 (powarg); // don't need accuracy here

  v4sf w = powterm * (logarg - logterm + logterm / logarg);
  v4sf logw = vfastlog (w);
  v4sf p = x - logw;

  return w * (two + p + w * (three + two * p)) /
         (two - p + w * (five + two * w));
}

static inline v4sf
vfasterlambertwexpx (v4sf x)
{
  const v4sf k = v4sfl (1.1765631309f);
  const v4sf a = v4sfl (0.94537622168f);

  v4sf logarg = _mm_max_ps (x, k);
  v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k));

  v4sf logterm = vfasterlog (logarg);
  v4sf powterm = vfasterpow2 (powarg);

  v4sf w = powterm * (logarg - logterm + logterm / logarg);
  v4sf logw = vfasterlog (w);

  return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w);
}

#endif // __SSE2__

#endif // __FAST_LAMBERT_W_H_


#ifndef __FAST_POW_H_
#define __FAST_POW_H_

#include <stdint.h>

static inline float
fastpow (float x,
         float p)
{
  return fastpow2 (p * fastlog2 (x));
}

static inline float
fasterpow (float x,
           float p)
{
  return fasterpow2 (p * fasterlog2 (x));
}
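
// Editor's note: both variants use the identity x**p = 2**(p * log2(x)),
// so, like powf, they are only meaningful for x > 0.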

#ifdef __SSE2__

static inline v4sf
vfastpow (const v4sf x,
          const v4sf p)
{
  return vfastpow2 (p * vfastlog2 (x));
}

static inline v4sf
vfasterpow (const v4sf x,
            const v4sf p)
{
  return vfasterpow2 (p * vfasterlog2 (x));
}

#endif //__SSE2__

#endif // __FAST_POW_H_

#ifndef __FAST_SIGMOID_H_
#define __FAST_SIGMOID_H_

#include <stdint.h>

static inline float
fastsigmoid (float x)
{
  return 1.0f / (1.0f + fastexp (-x));
}

static inline float
fastersigmoid (float x)
{
  return 1.0f / (1.0f + fasterexp (-x));
}

#ifdef __SSE2__

static inline v4sf
vfastsigmoid (const v4sf x)
{
  const v4sf c_1 = v4sfl (1.0f);

  return c_1 / (c_1 + vfastexp (-x));
}

static inline v4sf
vfastersigmoid (const v4sf x)
{
  const v4sf c_1 = v4sfl (1.0f);

  return c_1 / (c_1 + vfasterexp (-x));
}
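
// Usage sketch (editor's note, not part of the original API): apply the
// vectorized sigmoid across an array of arbitrary length, falling back
// to the scalar version for the remainder.  Uses unaligned loads/stores
// so no alignment is assumed.
static inline void
vfastsigmoid_array (const float *in, float *out, int n)
{
  int i = 0;
  for (; i + 4 <= n; i += 4)
    _mm_storeu_ps (out + i, vfastsigmoid (_mm_loadu_ps (in + i)));
  for (; i < n; ++i)
    out[i] = fastsigmoid (in[i]);
}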

#endif //__SSE2__

#endif // __FAST_SIGMOID_H_

#ifndef __FAST_TRIG_H_
#define __FAST_TRIG_H_

#include <stdint.h>

// http://www.devmaster.net/forums/showthread.php?t=5784
// fast sine variants are for x \in [ -\pi, \pi ]
// fast cosine variants are for x \in [ -\pi, \pi ]
// fast tangent variants are for x \in [ -\pi / 2, \pi / 2 ]
// "full" versions of the functions handle the entire range of inputs,
// although the range reduction technique used here will be hopelessly
// inaccurate for |x| >> 1000
//
// WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than
// libc calls on older machines (!) and on newer machines are only
// slightly faster.  however:
//   * vectorized versions are competitive
//   * faster full versions are competitive

static inline float
fastsin (float x)
{
  static const float fouroverpi = 1.2732395447351627f;
  static const float fouroverpisq = 0.40528473456935109f;
  static const float q = 0.78444488374548933f;
  union { float f; uint32_t i; } p = { 0.20363937680730309f };
  union { float f; uint32_t i; } r = { 0.015124940802184233f };
  union { float f; uint32_t i; } s = { -0.0032225901625579573f };

  union { float f; uint32_t i; } vx = { x };
  uint32_t sign = vx.i & 0x80000000;
  vx.i = vx.i & 0x7FFFFFFF;

  float qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
  float qpproxsq = qpprox * qpprox;

  p.i |= sign;
  r.i |= sign;
  s.i ^= sign;

  return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f));
}
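
// How fastsin works (editor's note): qpprox is the classic parabolic
// approximation sin(x) ~ (4/pi) x - (4/pi^2) x |x| from the devmaster
// thread linked above; the odd polynomial in qpprox refines it, with the
// sign of x injected into the coefficients p, r, s through their bit
// representations so the result remains an odd function of x.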

static inline float
fastersin (float x)
{
  static const float fouroverpi = 1.2732395447351627f;
  static const float fouroverpisq = 0.40528473456935109f;
  static const float q = 0.77633023248007499f;
  union { float f; uint32_t i; } p = { 0.22308510060189463f };

  union { float f; uint32_t i; } vx = { x };
  uint32_t sign = vx.i & 0x80000000;
  vx.i &= 0x7FFFFFFF;

  float qpprox = fouroverpi * x - fouroverpisq * x * vx.f;

  p.i |= sign;

  return qpprox * (q + p.f * qpprox);
}

static inline float
fastsinfull (float x)
{
  static const float twopi = 6.2831853071795865f;
  static const float invtwopi = 0.15915494309189534f;

  int k = x * invtwopi;
  float half = (x < 0) ? -0.5f : 0.5f;
  return fastsin ((half + k) * twopi - x);
}

static inline float
fastersinfull (float x)
{
  static const float twopi = 6.2831853071795865f;
  static const float invtwopi = 0.15915494309189534f;

  int k = x * invtwopi;
  float half = (x < 0) ? -0.5f : 0.5f;
  return fastersin ((half + k) * twopi - x);
}
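
// Editor's note on the range reduction above: with k = trunc(x / twopi)
// and half = +/- 0.5, the argument (half + k) * twopi - x always lands
// in [-pi, pi], and sin((half + k) * twopi - x) = sin(pi - (x - twopi * k))
// = sin(x), so no explicit fmod is needed.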

static inline float
fastcos (float x)
{
  static const float halfpi = 1.5707963267948966f;
  static const float halfpiminustwopi = -4.7123889803846899f;
  float offset = (x > halfpi) ? halfpiminustwopi : halfpi;
  return fastsin (x + offset);
}

static inline float
fastercos (float x)
{
  static const float twooverpi = 0.63661977236758134f;
  static const float p = 0.54641335845679634f;

  union { float f; uint32_t i; } vx = { x };
  vx.i &= 0x7FFFFFFF;

  float qpprox = 1.0f - twooverpi * vx.f;

  return qpprox + p * qpprox * (1.0f - qpprox * qpprox);
}

static inline float
fastcosfull (float x)
{
  static const float halfpi = 1.5707963267948966f;
  return fastsinfull (x + halfpi);
}

static inline float
fastercosfull (float x)
{
  static const float halfpi = 1.5707963267948966f;
  return fastersinfull (x + halfpi);
}

static inline float
fasttan (float x)
{
  static const float halfpi = 1.5707963267948966f;
  return fastsin (x) / fastsin (x + halfpi);
}

static inline float
fastertan (float x)
{
  return fastersin (x) / fastercos (x);
}

static inline float
fasttanfull (float x)
{
  static const float twopi = 6.2831853071795865f;
  static const float invtwopi = 0.15915494309189534f;

  int k = x * invtwopi;
  float half = (x < 0) ? -0.5f : 0.5f;
  float xnew = x - (half + k) * twopi;

  return fastsin (xnew) / fastcos (xnew);
}

static inline float
fastertanfull (float x)
{
  static const float twopi = 6.2831853071795865f;
  static const float invtwopi = 0.15915494309189534f;

  int k = x * invtwopi;
  float half = (x < 0) ? -0.5f : 0.5f;
  float xnew = x - (half + k) * twopi;

  return fastersin (xnew) / fastercos (xnew);
}

#ifdef __SSE2__

static inline v4sf
vfastsin (const v4sf x)
{
  const v4sf fouroverpi = v4sfl (1.2732395447351627f);
  const v4sf fouroverpisq = v4sfl (0.40528473456935109f);
  const v4sf q = v4sfl (0.78444488374548933f);
  const v4sf p = v4sfl (0.20363937680730309f);
  const v4sf r = v4sfl (0.015124940802184233f);
  const v4sf s = v4sfl (-0.0032225901625579573f);

  union { v4sf f; v4si i; } vx = { x };
  v4si sign = vx.i & v4sil (0x80000000);
  vx.i &= v4sil (0x7FFFFFFF);

  v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f;
  v4sf qpproxsq = qpprox * qpprox;
  union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s));
  vy.i ^= sign;

  return q * qpprox + vy.f;
}

static inline v4sf
vfastersin (const v4sf x)
{
  const v4sf fouroverpi = v4sfl (1.2732395447351627f);
  const v4sf fouroverpisq = v4sfl (0.40528473456935109f);
  const v4sf q = v4sfl (0.77633023248007499f);
  const v4sf plit = v4sfl (0.22308510060189463f);
  union { v4sf f; v4si i; } p = { plit };

  union { v4sf f; v4si i; } vx = { x };
  v4si sign = vx.i & v4sil (0x80000000);
  vx.i &= v4sil (0x7FFFFFFF);

  v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f;

  p.i |= sign;

  return qpprox * (q + p.f * qpprox);
}

static inline v4sf
vfastsinfull (const v4sf x)
{
  const v4sf twopi = v4sfl (6.2831853071795865f);
  const v4sf invtwopi = v4sfl (0.15915494309189534f);

  v4si k = v4sf_to_v4si (x * invtwopi);

  v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
  v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
                         _mm_andnot_ps (ltzero, v4sfl (0.5f)));

  return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x);
}

static inline v4sf
vfastersinfull (const v4sf x)
{
  const v4sf twopi = v4sfl (6.2831853071795865f);
  const v4sf invtwopi = v4sfl (0.15915494309189534f);

  v4si k = v4sf_to_v4si (x * invtwopi);

  v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
  v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
                         _mm_andnot_ps (ltzero, v4sfl (0.5f)));

  return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x);
}

static inline v4sf
vfastcos (const v4sf x)
{
  const v4sf halfpi = v4sfl (1.5707963267948966f);
  const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f);
  v4sf gehalfpi = _mm_cmpnlt_ps (x, halfpi);
  v4sf offset = _mm_or_ps (_mm_and_ps (gehalfpi, halfpiminustwopi),
                           _mm_andnot_ps (gehalfpi, halfpi));
  return vfastsin (x + offset);
}

static inline v4sf
vfastercos (v4sf x)
{
  const v4sf twooverpi = v4sfl (0.63661977236758134f);
  const v4sf p = v4sfl (0.54641335845679634f);

  v4sf vx = v4sf_fabs (x);
  v4sf qpprox = v4sfl (1.0f) - twooverpi * vx;

  return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox);
}

static inline v4sf
vfastcosfull (const v4sf x)
{
  const v4sf halfpi = v4sfl (1.5707963267948966f);
  return vfastsinfull (x + halfpi);
}

static inline v4sf
vfastercosfull (const v4sf x)
{
  const v4sf halfpi = v4sfl (1.5707963267948966f);
  return vfastersinfull (x + halfpi);
}

static inline v4sf
vfasttan (const v4sf x)
{
  const v4sf halfpi = v4sfl (1.5707963267948966f);
  return vfastsin (x) / vfastsin (x + halfpi);
}

static inline v4sf
vfastertan (const v4sf x)
{
  return vfastersin (x) / vfastercos (x);
}

static inline v4sf
vfasttanfull (const v4sf x)
{
  const v4sf twopi = v4sfl (6.2831853071795865f);
  const v4sf invtwopi = v4sfl (0.15915494309189534f);

  v4si k = v4sf_to_v4si (x * invtwopi);

  v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
  v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
                         _mm_andnot_ps (ltzero, v4sfl (0.5f)));
  v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi;

  return vfastsin (xnew) / vfastcos (xnew);
}

static inline v4sf
vfastertanfull (const v4sf x)
{
  const v4sf twopi = v4sfl (6.2831853071795865f);
  const v4sf invtwopi = v4sfl (0.15915494309189534f);

  v4si k = v4sf_to_v4si (x * invtwopi);

  v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f));
  v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)),
                         _mm_andnot_ps (ltzero, v4sfl (0.5f)));
  v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi;

  return vfastersin (xnew) / vfastercos (xnew);
}

#endif //__SSE2__

#endif // __FAST_TRIG_H_