1 /* This file is part of the Spring engine (GPL v2 or later), see LICENSE.html */
2 
3 #ifndef FASTMATH_H
4 #define FASTMATH_H
5 
6 #ifndef DEDICATED_NOSSE
7 #include <xmmintrin.h>
8 #endif
9 #include <boost/cstdint.hpp>
10 #include "lib/streflop/streflop_cond.h"
11 #include "System/maindefines.h"
12 
13 #ifdef _MSC_VER
14 #define __builtin_sqrtf streflop::sqrtf
15 #endif
16 
17 #ifdef __GNUC__
18 	#define _const __attribute__((const))
19 	#define _pure __attribute__((pure))
20 	#define _warn_unused_result __attribute__((warn_unused_result))
21 #else
22 	#define _const
23 	#define _pure
24 	#define _warn_unused_result
25 #endif
26 
27 /**
28  * @brief Fast math routines
29  *
30  * Contains faster alternatives for the more time
31  * consuming standard math routines. These functions
32  * are not as accurate, however, but are generally
33  * acceptable for most applications.
34  *
35  */
36 
37 namespace fastmath {
38 	float isqrt_nosse(float) _const;
39 	float isqrt_sse(float x) _const;
40 	float sqrt_sse(float x) _const;
41 	float isqrt_nosse(float x) _const;
42 	float isqrt2_nosse(float x) _const;
43 	float sqrt(float x) _const;
44 	float sqrt2(float x) _const;
45 	float apxsqrt(float x) _const;
46 	float apxsqrt2(float x) _const;
47 	float isqrt(float x) _const;
48 	float isqrt2(float x) _const;
49 	float sin(float x) _const;
50 	float cos(float x) _const;
51 	template<typename T> float floor(const T& f) _const;
52 
53 	/****************** Square root functions ******************/
54 
55 	/**
56 	* @brief DO NOT USE IN SYNCED CODE. Calculates 1/sqrt(x) using SSE instructions.
57 	*
58 	* This is much slower than isqrt_nosse (extremely slow on AMDs) and
59 	* additionally gives different results on Intel and AMD processors.
60 	*/
61 	__FORCE_ALIGN_STACK__
isqrt_sse(float x)62 	inline float isqrt_sse(float x)
63 	{
64 #ifndef DEDICATED_NOSSE
65 		__m128 vec = _mm_set_ss(x);
66 		vec = _mm_rsqrt_ss(vec);
67 		return _mm_cvtss_f32(vec);
68 #else
69 		return isqrt_nosse(x);
70 #endif
71 	}
72 
73 	/**
74 	* @brief Sync-safe. Calculates square root with using SSE instructions.
75 	*
76 	* Slower than std::sqrtf, much faster than streflop
77 	*/
78 	__FORCE_ALIGN_STACK__
sqrt_sse(float x)79 	inline float sqrt_sse(float x)
80 	{
81 #ifndef DEDICATED_NOSSE
82 		__m128 vec = _mm_set_ss(x);
83 		vec = _mm_sqrt_ss(vec);
84 		return _mm_cvtss_f32(vec);
85 #else
86 		return sqrt(x);
87 #endif
88 	}
89 
90 
91 	/**
92 	* @brief Calculates 1/sqrt(x) with less accuracy
93 	*
94 	* Gets a very good first guess and then uses one
95 	* iteration of newton's method for improved accuracy.
96 	* Average relative error: 0.093%
97 	* Highest relative error: 0.175%
98 	*
99 	* see "The Math Behind The Fast Inverse Square Root Function Code"
100 	* by Charles McEniry [2007] for a mathematical derivation of this
101 	* method (or Chris Lomont's 2003 "Fast Inverse Square Root" paper)
102 	*
103 	* It has been found to give slightly different results on different Intel CPUs.
104 	* Possible cause: 32bit vs 64bit. Use with care.
105 	*/
isqrt_nosse(float x)106 	inline float isqrt_nosse(float x) {
107 		float xh = 0.5f * x;
108 		boost::int32_t i = *(boost::int32_t*) &x;
109 		// "magic number" which makes a very good first guess
110 		i = 0x5f375a86 - (i >> 1);
111 		x = *(float*) &i;
112 		// Newton's method. One iteration for less accuracy but more speed.
113 		x = x * (1.5f - xh * (x * x));
114 		return x;
115 	}
116 
117 	/**
118 	* @brief Calculates 1/sqrt(x) with more accuracy
119 	*
120 	* Gets a very good first guess and then uses two
121 	* iterations of newton's method for improved accuracy.
122 	* Average relative error: 0.00017%
123 	* Highest relative error: 0.00047%
124 	*
125 	*/
isqrt2_nosse(float x)126 	inline float isqrt2_nosse(float x) {
127 		float xh = 0.5f * x;
128 		boost::int32_t i = *(boost::int32_t*) &x;
129 		// "magic number" which makes a very good first guess
130 		i = 0x5f375a86 - (i >> 1);
131 		x = *(float*) &i;
132 		// Newton's method. Two iterations for more accuracy but less speed.
133 		x = x * (1.5f - xh * (x * x));
134 		x = x * (1.5f - xh * (x * x));
135 		return x;
136 
137 	}
138 
139 
140 	/****************** Square root ******************/
141 
142 
143 	/** Calculate sqrt using builtin sqrtf. */
sqrt(float x)144 	inline float sqrt(float x)
145 	{
146 		return __builtin_sqrtf(x);
147 	}
148 
149 	/** Calculate sqrt using builtin sqrtf. */
sqrt2(float x)150 	inline float sqrt2(float x)
151 	{
152 		return __builtin_sqrtf(x);
153 	}
154 
155 	/**
156 	* @brief A (possibly very) inaccurate and numerically unstable, but fast, version of sqrt.
157 	*
158 	* Use with care.
159 	*/
apxsqrt(float x)160 	inline float apxsqrt(float x) {
161 		return x * isqrt_nosse(x);
162 	}
163 
164 	/**
165 	* @brief A (possibly very) inaccurate and numerically unstable, but fast, version of sqrt.
166 	*
167 	* Use with care. This should be a little bit better, albeit slower, than fastmath::sqrt.
168 	*/
apxsqrt2(float x)169 	inline float apxsqrt2(float x) {
170 		return x * isqrt2_nosse(x);
171 	}
172 
173 	/**
174 	* @brief Calculates 1/sqrt(x) very quickly.
175 	*
176 	*/
177 
isqrt(float x)178 	inline float isqrt(float x) {
179 		return isqrt2_nosse(x);
180 	}
181 	/**
182 	* @brief Calculates 1/sqrt(x) very quickly. More accurate but slower than isqrt.
183 	*
184 	*/
isqrt2(float x)185 	inline float isqrt2(float x) {
186 		return isqrt2_nosse(x);
187 	}
188 
189 	/****************** Trigonometric functions ******************/
190 
191 	/**
192 	* @brief Pi
193 	*
194 	* Cherry flavored.
195 	*/
196 	static const float PI = 3.141592654f;
197 
198 	/**
199 	* @brief Half of pi
200 	*
201 	* Pi divided by two
202 	*/
203 	static const float HALFPI = PI / 2.0f;
204 
205 	/**
206 	* @brief Pi times two
207 	*
208 	* Pi times two
209 	*/
210 	static const float PI2 = PI * 2.0f;
211 
212 	/**
213 	* @brief Four divided by pi
214 	*
215 	* Four over pi
216 	*/
217 	static const float PIU4 = 4.0f / PI;
218 
219 	/**
220 	* @brief Negative four divided by pi squared
221 	*
222 	* Negative four over (pi squared)
223 	*/
224 	static const float PISUN4 = -4.0f / (PI * PI);
225 
226 	/**
227 	* @brief reciprocal of pi
228 	*
229 	* One over (pi times two)
230 	*/
231 	static const float INVPI2 = 1.0f / PI2;
232 
233 	/**
234 	* @brief negative half pi
235 	*
236 	* -pi / 2
237 	*/
238 	static const float NEGHALFPI = -HALFPI;
239 
240 	/**
241 	* @brief calculates the sine of x
242 	*
243 	* Range reduces x to -PI ... PI, and then uses the
244 	* sine approximation method as described at
245 	* http://www.devmaster.net/forums/showthread.php?t=5784
246 	*
247 	* Average percentage error: 0.15281632393574715%
248 	* Highest percentage error: 0.17455324009559584%
249 	*/
sin(float x)250 	inline float sin(float x) {
251 		/* range reduce to -PI ... PI, as the approximation
252 		method only works well for that range. */
253 		x = x - ((int)(x * INVPI2)) * PI2;
254 		if (x > HALFPI) {
255 			x = -x + PI;
256 		} else if (x < NEGHALFPI ) {
257 			x = -x - PI;
258 		}
259 		/* approximation */
260 		x = (PIU4) * x + (PISUN4) * x * math::fabs(x);
261 		x = 0.225f * (x * math::fabs(x) - x) + x;
262 		return x;
263 	}
264 
265 	/**
266 	* @brief calculates the cosine of x
267 	*
268 	* Adds half of pi to x and then uses the sine method.
269 	*/
cos(float x)270 	inline float cos(float x) {
271 		return sin(x + HALFPI);
272 	}
273 
274 
275 	/**
276 	* @brief fast version of std::floor
277 	*
278 	* Like 2-3x faster than glibc ones.
279 	* Note: The results differ at the end of the 32bit precision range.
280 	*/
281 	template<typename T>
floor(const T & f)282 	inline float floor(const T& f)
283 	{
284 		return (f >= 0) ? int(f) : int(f+0.000001f)-1;
285 	}
286 }
287 
288 using fastmath::PI;
289 namespace math {
290 	// override streflop with faster sqrt!
291 	float sqrt(float x) _const;
sqrt(float x)292 	inline float sqrt(float x) {
293 		return fastmath::sqrt_sse(x);
294 	}
295 
296 	using fastmath::isqrt;
297 	using fastmath::isqrt2;
298 
299 	using fastmath::floor;
300 }
301 
302 #endif
303