1 /* This file is part of the Spring engine (GPL v2 or later), see LICENSE.html */ 2 3 #ifndef FASTMATH_H 4 #define FASTMATH_H 5 6 #ifndef DEDICATED_NOSSE 7 #include <xmmintrin.h> 8 #endif 9 #include <boost/cstdint.hpp> 10 #include "lib/streflop/streflop_cond.h" 11 #include "System/maindefines.h" 12 13 #ifdef _MSC_VER 14 #define __builtin_sqrtf streflop::sqrtf 15 #endif 16 17 #ifdef __GNUC__ 18 #define _const __attribute__((const)) 19 #define _pure __attribute__((pure)) 20 #define _warn_unused_result __attribute__((warn_unused_result)) 21 #else 22 #define _const 23 #define _pure 24 #define _warn_unused_result 25 #endif 26 27 /** 28 * @brief Fast math routines 29 * 30 * Contains faster alternatives for the more time 31 * consuming standard math routines. These functions 32 * are not as accurate, however, but are generally 33 * acceptable for most applications. 34 * 35 */ 36 37 namespace fastmath { 38 float isqrt_nosse(float) _const; 39 float isqrt_sse(float x) _const; 40 float sqrt_sse(float x) _const; 41 float isqrt_nosse(float x) _const; 42 float isqrt2_nosse(float x) _const; 43 float sqrt(float x) _const; 44 float sqrt2(float x) _const; 45 float apxsqrt(float x) _const; 46 float apxsqrt2(float x) _const; 47 float isqrt(float x) _const; 48 float isqrt2(float x) _const; 49 float sin(float x) _const; 50 float cos(float x) _const; 51 template<typename T> float floor(const T& f) _const; 52 53 /****************** Square root functions ******************/ 54 55 /** 56 * @brief DO NOT USE IN SYNCED CODE. Calculates 1/sqrt(x) using SSE instructions. 57 * 58 * This is much slower than isqrt_nosse (extremely slow on AMDs) and 59 * additionally gives different results on Intel and AMD processors. 60 */ 61 __FORCE_ALIGN_STACK__ isqrt_sse(float x)62 inline float isqrt_sse(float x) 63 { 64 #ifndef DEDICATED_NOSSE 65 __m128 vec = _mm_set_ss(x); 66 vec = _mm_rsqrt_ss(vec); 67 return _mm_cvtss_f32(vec); 68 #else 69 return isqrt_nosse(x); 70 #endif 71 } 72 73 /** 74 * @brief Sync-safe. Calculates square root with using SSE instructions. 75 * 76 * Slower than std::sqrtf, much faster than streflop 77 */ 78 __FORCE_ALIGN_STACK__ sqrt_sse(float x)79 inline float sqrt_sse(float x) 80 { 81 #ifndef DEDICATED_NOSSE 82 __m128 vec = _mm_set_ss(x); 83 vec = _mm_sqrt_ss(vec); 84 return _mm_cvtss_f32(vec); 85 #else 86 return sqrt(x); 87 #endif 88 } 89 90 91 /** 92 * @brief Calculates 1/sqrt(x) with less accuracy 93 * 94 * Gets a very good first guess and then uses one 95 * iteration of newton's method for improved accuracy. 96 * Average relative error: 0.093% 97 * Highest relative error: 0.175% 98 * 99 * see "The Math Behind The Fast Inverse Square Root Function Code" 100 * by Charles McEniry [2007] for a mathematical derivation of this 101 * method (or Chris Lomont's 2003 "Fast Inverse Square Root" paper) 102 * 103 * It has been found to give slightly different results on different Intel CPUs. 104 * Possible cause: 32bit vs 64bit. Use with care. 105 */ isqrt_nosse(float x)106 inline float isqrt_nosse(float x) { 107 float xh = 0.5f * x; 108 boost::int32_t i = *(boost::int32_t*) &x; 109 // "magic number" which makes a very good first guess 110 i = 0x5f375a86 - (i >> 1); 111 x = *(float*) &i; 112 // Newton's method. One iteration for less accuracy but more speed. 113 x = x * (1.5f - xh * (x * x)); 114 return x; 115 } 116 117 /** 118 * @brief Calculates 1/sqrt(x) with more accuracy 119 * 120 * Gets a very good first guess and then uses two 121 * iterations of newton's method for improved accuracy. 122 * Average relative error: 0.00017% 123 * Highest relative error: 0.00047% 124 * 125 */ isqrt2_nosse(float x)126 inline float isqrt2_nosse(float x) { 127 float xh = 0.5f * x; 128 boost::int32_t i = *(boost::int32_t*) &x; 129 // "magic number" which makes a very good first guess 130 i = 0x5f375a86 - (i >> 1); 131 x = *(float*) &i; 132 // Newton's method. Two iterations for more accuracy but less speed. 133 x = x * (1.5f - xh * (x * x)); 134 x = x * (1.5f - xh * (x * x)); 135 return x; 136 137 } 138 139 140 /****************** Square root ******************/ 141 142 143 /** Calculate sqrt using builtin sqrtf. */ sqrt(float x)144 inline float sqrt(float x) 145 { 146 return __builtin_sqrtf(x); 147 } 148 149 /** Calculate sqrt using builtin sqrtf. */ sqrt2(float x)150 inline float sqrt2(float x) 151 { 152 return __builtin_sqrtf(x); 153 } 154 155 /** 156 * @brief A (possibly very) inaccurate and numerically unstable, but fast, version of sqrt. 157 * 158 * Use with care. 159 */ apxsqrt(float x)160 inline float apxsqrt(float x) { 161 return x * isqrt_nosse(x); 162 } 163 164 /** 165 * @brief A (possibly very) inaccurate and numerically unstable, but fast, version of sqrt. 166 * 167 * Use with care. This should be a little bit better, albeit slower, than fastmath::sqrt. 168 */ apxsqrt2(float x)169 inline float apxsqrt2(float x) { 170 return x * isqrt2_nosse(x); 171 } 172 173 /** 174 * @brief Calculates 1/sqrt(x) very quickly. 175 * 176 */ 177 isqrt(float x)178 inline float isqrt(float x) { 179 return isqrt2_nosse(x); 180 } 181 /** 182 * @brief Calculates 1/sqrt(x) very quickly. More accurate but slower than isqrt. 183 * 184 */ isqrt2(float x)185 inline float isqrt2(float x) { 186 return isqrt2_nosse(x); 187 } 188 189 /****************** Trigonometric functions ******************/ 190 191 /** 192 * @brief Pi 193 * 194 * Cherry flavored. 195 */ 196 static const float PI = 3.141592654f; 197 198 /** 199 * @brief Half of pi 200 * 201 * Pi divided by two 202 */ 203 static const float HALFPI = PI / 2.0f; 204 205 /** 206 * @brief Pi times two 207 * 208 * Pi times two 209 */ 210 static const float PI2 = PI * 2.0f; 211 212 /** 213 * @brief Four divided by pi 214 * 215 * Four over pi 216 */ 217 static const float PIU4 = 4.0f / PI; 218 219 /** 220 * @brief Negative four divided by pi squared 221 * 222 * Negative four over (pi squared) 223 */ 224 static const float PISUN4 = -4.0f / (PI * PI); 225 226 /** 227 * @brief reciprocal of pi 228 * 229 * One over (pi times two) 230 */ 231 static const float INVPI2 = 1.0f / PI2; 232 233 /** 234 * @brief negative half pi 235 * 236 * -pi / 2 237 */ 238 static const float NEGHALFPI = -HALFPI; 239 240 /** 241 * @brief calculates the sine of x 242 * 243 * Range reduces x to -PI ... PI, and then uses the 244 * sine approximation method as described at 245 * http://www.devmaster.net/forums/showthread.php?t=5784 246 * 247 * Average percentage error: 0.15281632393574715% 248 * Highest percentage error: 0.17455324009559584% 249 */ sin(float x)250 inline float sin(float x) { 251 /* range reduce to -PI ... PI, as the approximation 252 method only works well for that range. */ 253 x = x - ((int)(x * INVPI2)) * PI2; 254 if (x > HALFPI) { 255 x = -x + PI; 256 } else if (x < NEGHALFPI ) { 257 x = -x - PI; 258 } 259 /* approximation */ 260 x = (PIU4) * x + (PISUN4) * x * math::fabs(x); 261 x = 0.225f * (x * math::fabs(x) - x) + x; 262 return x; 263 } 264 265 /** 266 * @brief calculates the cosine of x 267 * 268 * Adds half of pi to x and then uses the sine method. 269 */ cos(float x)270 inline float cos(float x) { 271 return sin(x + HALFPI); 272 } 273 274 275 /** 276 * @brief fast version of std::floor 277 * 278 * Like 2-3x faster than glibc ones. 279 * Note: The results differ at the end of the 32bit precision range. 280 */ 281 template<typename T> floor(const T & f)282 inline float floor(const T& f) 283 { 284 return (f >= 0) ? int(f) : int(f+0.000001f)-1; 285 } 286 } 287 288 using fastmath::PI; 289 namespace math { 290 // override streflop with faster sqrt! 291 float sqrt(float x) _const; sqrt(float x)292 inline float sqrt(float x) { 293 return fastmath::sqrt_sse(x); 294 } 295 296 using fastmath::isqrt; 297 using fastmath::isqrt2; 298 299 using fastmath::floor; 300 } 301 302 #endif 303