1 /* This file is part of the Spring engine (GPL v2 or later), see LICENSE.html */
2 
3 #ifdef USE_VALGRIND
4 	#include <valgrind/valgrind.h>
5 #endif
6 
7 #include "FPUCheck.h"
8 #include "lib/streflop/streflop_cond.h"
9 #include "System/Exceptions.h"
10 #include "System/ThreadPool.h"
11 #include "System/Log/ILog.h"
12 #include "System/Platform/CpuID.h"
13 
14 #ifdef STREFLOP_H
15 	#ifdef STREFLOP_SSE
16 	#elif STREFLOP_X87
17 	#else
18 		#error "streflop FP-math mode must be either SSE or X87"
19 	#endif
20 #endif
21 
22 /**
	@brief Checks the FPU control registers.
	Checks the FPU control registers MXCSR and FPUCW against the accepted
	(sync-safe) values listed below and re-initializes streflop on a mismatch.
25 
26 For reference, the layout of the MXCSR register:
27             FZ:RC:RC:PM:UM:OM:ZM:DM:IM: Rsvd:PE:UE:OE:ZE:DE:IE
28             15 14 13 12 11 10  9  8  7|   6   5  4  3  2  1  0
29 Spring1:     0  0  0  1  1  1  0  1  0|   0   0  0  0  0  0  0 = 0x1D00 = 7424
30 Spring2:     0  0  0  1  1  1  1  1  1|   0   0  0  0  0  0  0 = 0x1F80 = 8064
31 Spring3:     0  0  0  1  1  0  0  1  0|   0   0  0  0  0  0  0 = 0x1900 = 6400  (signan)
32 Default:     0  0  0  1  1  1  1  1  1|   0   0  0  0  0  0  0 = 0x1F80 = 8064
33 MaskRsvd:    1  1  1  1  1  1  1  1  1|   0   0  0  0  0  0  0 = 0xFF80
34 
35 And the layout of the 387 FPU control word register:
36            Rsvd:Rsvd:Rsvd:X:RC:RC:PC:PC: Rsvd:Rsvd:PM:UM:OM:ZM:DM:IM
37             15   14   13 12 11 10  9  8|   7    6   5  4  3  2  1  0
38 Spring1:     0    0    0  0  0  0  0  0|   0    0   1  1  1  0  1  0 = 0x003A = 58
39 Spring2:     0    0    0  0  0  0  0  0|   0    0   1  1  1  1  1  1 = 0x003F = 63
40 Spring3:     0    0    0  0  0  0  0  0|   0    0   1  1  0  0  1  0 = 0x0032 = 50   (signan)
41 Default:     0    0    0  0  0  0  1  1|   0    0   1  1  1  1  1  1 = 0x033F = 831
42 MaskRsvd:    0    0    0  1  1  1  1  1|   0    0   1  1  1  1  1  1 = 0x0F3F
43 
44 	Where:
45 		Rsvd - Reserved
46 		FZ   - Flush to Zero
47 		RC   - Rounding Control
48 		PM   - Precision Mask
49 		UM   - Underflow Mask
50 		OM   - Overflow Mask
51 		ZM   - Zerodivide Mask
52 		DM   - Denormal Mask
53 		IM   - Invalid Mask
54 		PE   - Precision Exception
55 		UE   - Underflow Exception
56 		OE   - Overflow Exception
57 		ZE   - Zerodivide Exception
58 		DE   - Denormal Exception
59 		IE   - Invalid Exception
60 		X    - Infinity control (unused on 387 and higher)
61 		PC   - Precision Control
62 
		Spring1  - Control word used by Spring in CGame::SimFrame().
		Spring2  - Control word used by Spring everywhere else.
		Spring3  - Control word with the overflow, zero-divide and invalid
		           exceptions unmasked (labelled "signan" in the code).
		Default  - Default control word according to Intel.
		MaskRsvd - Masks out the reserved bits.
67 
68 	Source: Intel Architecture Software Development Manual, Volume 1, Basic Architecture
69 */
void good_fpu_control_registers(const char* text)
{
	// Verifies that the FPU control state reported by streflop::fegetenv() is
	// one of the accepted (sync-safe) values from the register tables above.
	// On a mismatch a warning is logged and streflop is re-initialized.
	//
	// text: caller-supplied tag; it is only echoed in the warning messages.
	// Nothing is checked when neither STREFLOP_SSE nor STREFLOP_X87 is defined.
#ifdef USE_VALGRIND
	static const bool valgrindRunning = RUNNING_ON_VALGRIND;
	if (valgrindRunning) {
		// Valgrind doesn't allow us setting the FPU, so syncing is impossible
		return;
	}
#endif

#if defined(STREFLOP_SSE) || defined(STREFLOP_X87)
	// accepted/syncsafe x87 control words; compared after the reserved bits
	// have been masked out (MaskRsvd in the table above)
	const int x87_mask = 0x0F3F;
	const int x87_a = 0x003A;
	const int x87_b = 0x003F;
	const int x87_c = 0x0032; // signan
#endif

#if defined(STREFLOP_SSE)
	// accepted/syncsafe MXCSR values, again with the reserved bits masked out
	const int sse_mask = 0xFF80;
	const int sse_a = 0x1D00;
	const int sse_b = 0x1F80;
	const int sse_c = 0x1900; // signan

	// in SSE mode fpenv_t is a struct carrying both control words
	streflop::fpenv_t fenv;
	streflop::fegetenv(&fenv);

	const int fsse = fenv.sse_mode & sse_mask;
	const int fx87 = fenv.x87_mode & x87_mask;

	const bool sseOk = (fsse == sse_a) || (fsse == sse_b) || (fsse == sse_c);
	const bool x87Ok = (fx87 == x87_a) || (fx87 == x87_b) || (fx87 == x87_c);

	if (!(sseOk && x87Ok)) {
		// both registers are reported so the log shows the complete FPU state;
		// all three accepted values are listed, including the signan one
		LOG_L(L_WARNING, "[%s] Sync warning: (env.sse_mode) MXCSR 0x%04X instead of 0x%04X, 0x%04X or 0x%04X (\"%s\")", __FUNCTION__, fsse, sse_a, sse_b, sse_c, text);
		LOG_L(L_WARNING, "[%s] Sync warning: (env.x87_mode) FPUCW 0x%04X instead of 0x%04X, 0x%04X or 0x%04X (\"%s\")", __FUNCTION__, fx87, x87_a, x87_b, x87_c, text);

		// Set single precision floating point math.
		streflop::streflop_init<streflop::Simple>();
	#if defined(__SUPPORT_SNAN__)
		streflop::feraiseexcept(streflop::FPU_Exceptions(streflop::FE_INVALID | streflop::FE_DIVBYZERO | streflop::FE_OVERFLOW));
	#endif
	}

#elif defined(STREFLOP_X87)
	// in X87 mode fpenv_t is a plain short int holding the control word
	streflop::fpenv_t fenv;
	streflop::fegetenv(&fenv);

	// mask once; the same masked value is both compared and logged, so the
	// warning never shows reserved bits that played no part in the decision
	const int fx87 = fenv & x87_mask;
	const bool x87Ok = (fx87 == x87_a) || (fx87 == x87_b) || (fx87 == x87_c);

	if (!x87Ok) {
		LOG_L(L_WARNING, "[%s] Sync warning: FPUCW 0x%04X instead of 0x%04X, 0x%04X or 0x%04X (\"%s\")", __FUNCTION__, fx87, x87_a, x87_b, x87_c, text);

		// Set single precision floating point math.
		streflop::streflop_init<streflop::Simple>();
	#if defined(__SUPPORT_SNAN__)
		streflop::feraiseexcept(streflop::FPU_Exceptions(streflop::FE_INVALID | streflop::FE_DIVBYZERO | streflop::FE_OVERFLOW));
	#endif
	}
#endif
}
131 
good_fpu_init()132 void good_fpu_init()
133 {
134 	const unsigned int sseBits = springproc::GetProcSSEBits();
135 		LOG("[CMyMath::Init] CPU SSE mask: %u, flags:", sseBits);
136 		LOG("\tSSE 1.0:  %d,  SSE 2.0:  %d", (sseBits >> 5) & 1, (sseBits >> 4) & 1);
137 		LOG("\tSSE 3.0:  %d, SSSE 3.0:  %d", (sseBits >> 3) & 1, (sseBits >> 2) & 1);
138 		LOG("\tSSE 4.1:  %d,  SSE 4.2:  %d", (sseBits >> 1) & 1, (sseBits >> 0) & 1);
139 		LOG("\tSSE 4.0A: %d,  SSE 5.0A: %d", (sseBits >> 8) & 1, (sseBits >> 7) & 1);
140 
141 #ifdef STREFLOP_H
142 	const bool hasSSE1 = (sseBits >> 5) & 1;
143 
144 	#ifdef STREFLOP_SSE
145 	if (hasSSE1) {
146 		LOG("\tusing streflop SSE FP-math mode, CPU supports SSE instructions");
147 	} else {
148 		throw unsupported_error("CPU is missing SSE instruction support");
149 	}
150 	#else
151 	if (hasSSE1) {
152 		LOG_L(L_WARNING, "\tStreflop floating-point math is set to X87 mode");
153 		LOG_L(L_WARNING, "\tThis may cause desyncs during multi-player games");
154 		LOG_L(L_WARNING, "\tThis CPU is SSE-capable; consider recompiling");
155 	} else {
156 		LOG_L(L_WARNING, "\tStreflop floating-point math is not SSE-enabled");
157 		LOG_L(L_WARNING, "\tThis may cause desyncs during multi-player games");
158 		LOG_L(L_WARNING, "\tThis CPU is not SSE-capable; it can only use X87 mode");
159 	}
160 	#endif
161 
162 	// Set single precision floating point math.
163 	streflop::streflop_init<streflop::Simple>();
164 	#if defined(__SUPPORT_SNAN__)
165 		streflop::feraiseexcept(streflop::FPU_Exceptions(streflop::FE_INVALID | streflop::FE_DIVBYZERO | streflop::FE_OVERFLOW));
166 	#endif
167 
168 #else
169 	// probably should check if SSE was enabled during
170 	// compilation and issue a warning about illegal
171 	// instructions if so (or just die with an error)
172 	LOG_L(L_WARNING, "Floating-point math is not controlled by streflop");
173 	LOG_L(L_WARNING, "This makes keeping multi-player sync 99% impossible");
174 #endif
175 }
176 
streflop_init_omp()177 void streflop_init_omp() {
178 	// Initialize FPU in all worker threads, too
179 	// Note: It's not needed for sync'ness cause all precision relevant
180 	//       mode flags are shared across the process!
181 	//       But the exception ones aren't (but are copied from the calling thread).
182 	parallel([&]{
183 		streflop::streflop_init<streflop::Simple>();
184 	#if defined(__SUPPORT_SNAN__)
185 		streflop::feraiseexcept(streflop::FPU_Exceptions(streflop::FE_INVALID | streflop::FE_DIVBYZERO | streflop::FE_OVERFLOW));
186 	#endif
187 	});
188 }
189 
190 namespace springproc {
GetProcMaxStandardLevel()191 	unsigned int GetProcMaxStandardLevel()
192 	{
193 		unsigned int rEAX = 0x00000000;
194 		unsigned int rEBX =          0;
195 		unsigned int rECX =          0;
196 		unsigned int rEDX =          0;
197 
198 		ExecCPUID(&rEAX, &rEBX, &rECX, &rEDX);
199 
200 		return rEAX;
201 	}
202 
GetProcMaxExtendedLevel()203 	unsigned int GetProcMaxExtendedLevel()
204 	{
205 		unsigned int rEAX = 0x80000000;
206 		unsigned int rEBX =          0;
207 		unsigned int rECX =          0;
208 		unsigned int rEDX =          0;
209 
210 		ExecCPUID(&rEAX, &rEBX, &rECX, &rEDX);
211 
212 		return rEAX;
213 	}
214 
GetProcSSEBits()215 	unsigned int GetProcSSEBits()
216 	{
217 		unsigned int rEAX = 0;
218 		unsigned int rEBX = 0;
219 		unsigned int rECX = 0;
220 		unsigned int rEDX = 0;
221 		unsigned int bits = 0;
222 
223 		if (GetProcMaxStandardLevel() >= 0x00000001U) {
224 			rEAX = 0x00000001U; ExecCPUID(&rEAX, &rEBX, &rECX, &rEDX);
225 
226 			int SSE42  = (rECX >> 20) & 1; bits |= ( SSE42 << 0); // SSE 4.2
227 			int SSE41  = (rECX >> 19) & 1; bits |= ( SSE41 << 1); // SSE 4.1
228 			int SSSE30 = (rECX >>  9) & 1; bits |= (SSSE30 << 2); // Supplemental SSE 3.0
229 			int SSE30  = (rECX >>  0) & 1; bits |= ( SSE30 << 3); // SSE 3.0
230 
231 			int SSE20  = (rEDX >> 26) & 1; bits |= ( SSE20 << 4); // SSE 2.0
232 			int SSE10  = (rEDX >> 25) & 1; bits |= ( SSE10 << 5); // SSE 1.0
233 			int MMX    = (rEDX >> 23) & 1; bits |= ( MMX   << 6); // MMX
234 		}
235 
236 		if (GetProcMaxExtendedLevel() >= 0x80000001U) {
237 			rEAX = 0x80000001U; ExecCPUID(&rEAX, &rEBX, &rECX, &rEDX);
238 
239 			int SSE50A = (rECX >> 11) & 1; bits |= (SSE50A << 7); // SSE 5.0A
240 			int SSE40A = (rECX >>  6) & 1; bits |= (SSE40A << 8); // SSE 4.0A
241 			int MSSE   = (rECX >>  7) & 1; bits |= (MSSE   << 9); // Misaligned SSE
242 		}
243 
244 		return bits;
245 	}
246 }
247