/*
 * Copyright (C) 2007-2016 Paul Davis <paul@linuxaudiosystems.com>
 * Copyright (C) 2009-2012 David Robillard <d@drobilla.net>
 * Copyright (C) 2013-2015 John Emmas <john@creativepost.co.uk>
 * Copyright (C) 2015-2019 Robin Gareus <robin@gareus.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "libpbd-config.h"

#define _XOPEN_SOURCE 600
#include <cstring> // for memset
#include <cstdlib>
#include <stdint.h>
#include <assert.h>

#ifdef PLATFORM_WINDOWS
#include <intrin.h>
#endif

#ifdef ARM_NEON_SUPPORT
/* Needed for ARM NEON detection */
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif

#include "pbd/compose.h"
#include "pbd/fpu.h"
#include "pbd/error.h"

#include "pbd/i18n.h"

using namespace PBD;
using namespace std;

FPU* FPU::_instance (0);

#if ( (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) ) // ARCH_X86
#ifndef PLATFORM_WINDOWS

/* use __cpuid() as the name to match the MSVC/mingw intrinsic */

static void
__cpuid(int regs[4], int cpuid_leaf)
{
	asm volatile (
#if defined(__i386__)
			"pushl %%ebx;\n\t"
#endif
			"cpuid;\n\t"
			"movl %%eax, (%1);\n\t"
			"movl %%ebx, 4(%1);\n\t"
			"movl %%ecx, 8(%1);\n\t"
			"movl %%edx, 12(%1);\n\t"
#if defined(__i386__)
			"popl %%ebx;\n\t"
#endif
			:"=a" (cpuid_leaf) /* %eax clobbered by CPUID */
			:"S" (regs), "a" (cpuid_leaf)
			:
#if !defined(__i386__)
			"%ebx",
#endif
			"%ecx", "%edx", "memory");
}
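
/* The wrapper above matches the calling convention of the MSVC/mingw
 * intrinsic, so callers can be written identically on all x86 platforms.
 * Illustrative use only:
 *
 *   int regs[4];
 *   __cpuid (regs, 0); // leaf 0: regs[0] = highest supported leaf,
 *                      //         regs[1]/regs[3]/regs[2] = vendor identification string
 */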

#endif /* !PLATFORM_WINDOWS */

#ifndef HAVE_XGETBV // Allow definition by build system
	#if defined(__MINGW32__) && defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR >= 5
		#define HAVE_XGETBV
	#elif defined(_MSC_VER) && _MSC_VER >= 1600
		// '_xgetbv()' was only available from VC10 onwards
		#define HAVE_XGETBV
	#endif
#endif

#ifndef HAVE_XGETBV

#ifdef COMPILER_MSVC

// '_xgetbv()' was only available from VC10 onwards
__declspec(noinline) static uint64_t
_xgetbv (uint32_t xcr)
{
	return 0;

	// N.B.  The following would probably work for a pre-VC10 build,
	// although it might suffer from optimization issues.  We'd need
	// to place this function into its own (unoptimized) source file.
	__asm {
			 mov ecx, [xcr]
			 __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0 /*xgetbv*/
	}
}

#else

static uint64_t
_xgetbv (uint32_t xcr)
{
#ifdef __APPLE__
	/* it would be nice to make this work on OS X but as long as we use
	   veclib, we don't really need to know about SSE/AVX on that platform.
	*/
	return 0;
#else
	uint32_t eax, edx;
	__asm__ volatile ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (xcr));
	return (static_cast<uint64_t>(edx) << 32) | eax;
#endif
}

#endif /* !COMPILER_MSVC */
#endif /* !HAVE_XGETBV */
#endif /* ARCH_X86 */

#ifndef _XCR_XFEATURE_ENABLED_MASK
#define _XCR_XFEATURE_ENABLED_MASK 0
#endif

FPU*
FPU::instance()
{
	if (!_instance) {
		_instance = new FPU;
	}

	return _instance;
}
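
/* Typical use goes through the singleton; a minimal sketch, assuming the
 * has_*() accessors declared in pbd/fpu.h:
 *
 *   if (FPU::instance()->has_avx()) {
 *           // select the AVX code path
 *   }
 */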

void
FPU::destroy ()
{
	delete _instance;
	_instance = 0;
}

FPU::FPU ()
	: _flags ((Flags) 0)
{
	if (_instance) {
		error << _("FPU object instantiated more than once") << endmsg;
	}

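	/* ARDOUR_FPU_FLAGS bypasses the detection below, which is handy for
	 * testing; the value is read with atoi() and used verbatim as the
	 * Flags bitmask from pbd/fpu.h. Illustrative shell use only (assuming
	 * the usual binary name):
	 *
	 *   ARDOUR_FPU_FLAGS=0 ardour   # report no SIMD/denormal capabilities
	 */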
	if (getenv("ARDOUR_FPU_FLAGS")) {
		_flags = Flags (atoi (getenv("ARDOUR_FPU_FLAGS")));
		return;
	}

#ifdef ARM_NEON_SUPPORT
# ifdef __aarch64__
	/* all armv8 (and later) CPUs provide the NEON features used in arm_neon_functions.cc */
	_flags = Flags(_flags | HasNEON);
# elif defined __arm__
	if (getauxval(AT_HWCAP) & HWCAP_NEON) {
		_flags = Flags(_flags | HasNEON);
	}
# endif
#endif

#if !( (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) ) // !ARCH_X86
	/* Non-Intel architecture, nothing to do here */
	return;
#else

	/* Get the CPU vendor just for kicks
	 *
	 * __cpuid with an InfoType argument of 0 returns the number of
	 * valid Ids in CPUInfo[0] and the CPU identification string in
	 * the other three array elements. The CPU identification string is
	 * not in linear order. The code below arranges the information
	 * in a human readable form. The human readable order is CPUInfo[1] |
	 * CPUInfo[3] | CPUInfo[2]. CPUInfo[2] and CPUInfo[3] are swapped
	 * before using memcpy to copy these three array elements to cpu_string.
	 */
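	/* For example (illustrative only): on an Intel part, leaf 0 returns
	 * EBX = "Genu", EDX = "ineI", ECX = "ntel"; the swap and memcpy below
	 * reassemble these into the familiar "GenuineIntel" string.
	 */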

	int cpu_info[4];
	char cpu_string[48];
	string cpu_vendor;

	__cpuid (cpu_info, 0);

	int num_ids = cpu_info[0];
	std::swap(cpu_info[2], cpu_info[3]);
	memcpy(cpu_string, &cpu_info[1], 3 * sizeof(cpu_info[1]));
	cpu_vendor.assign(cpu_string, 3 * sizeof(cpu_info[1]));

	info << string_compose (_("CPU vendor: %1"), cpu_vendor) << endmsg;

	if (num_ids > 0) {

		/* Now get CPU/FPU flags */

		__cpuid (cpu_info, 1);

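		/* The 0x6 mask below checks XCR0 bits 1 (SSE/XMM state) and 2
		 * (AVX/YMM state); both must be set before the OS can be trusted
		 * to save and restore the AVX registers on context switches.
		 */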
		if ((cpu_info[2] & (1<<27)) /* OSXSAVE */ &&
		    (cpu_info[2] & (1<<28) /* AVX */) &&
		    ((_xgetbv (_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)) { /* OS really supports XSAVE */
			info << _("AVX-capable processor") << endmsg;
			_flags = Flags (_flags | (HasAVX));
		}

		if (cpu_info[2] & (1<<12) /* FMA */) {
			info << _("AVX with FMA capable processor") << endmsg;
			_flags = Flags (_flags | (HasFMA));
		}

		if (cpu_info[3] & (1<<25)) {
			_flags = Flags (_flags | (HasSSE|HasFlushToZero));
		}

		if (cpu_info[3] & (1<<26)) {
			_flags = Flags (_flags | HasSSE2);
		}

		/* Figure out CPU/FPU denormal handling capabilities */

		if (cpu_info[3] & (1 << 24)) {

			char** fxbuf = 0;

			/* DAZ wasn't available in the first version of SSE. Since
			   setting a reserved bit in MXCSR causes a general protection
			   fault, we need to be able to check the availability of this
			   feature without causing problems. To do this, one needs to
			   set up a 512-byte area of memory to save the SSE state to,
			   using fxsave, and then one needs to inspect bytes 28 through
			   31 for the MXCSR_MASK value. If bit 6 is set, DAZ is
			   supported, otherwise, it isn't.
			*/
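
			/* Relevant FXSAVE area layout (Intel SDM), for reference:
			 *
			 *   bytes  0..23   x87 control/status words and pointers
			 *   bytes 24..27   MXCSR
			 *   bytes 28..31   MXCSR_MASK  (bit 6 set => DAZ supported)
			 *   bytes 32..     ST/MM and XMM register images
			 */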

#ifndef HAVE_POSIX_MEMALIGN
#  ifdef PLATFORM_WINDOWS
			fxbuf = (char **) _aligned_malloc (sizeof (char *), 16);
			assert (fxbuf);
			*fxbuf = (char *) _aligned_malloc (512, 16);
			assert (*fxbuf);
#  else
#  warning using default malloc for aligned memory
			fxbuf = (char **) malloc (sizeof (char *));
			assert (fxbuf);
			*fxbuf = (char *) malloc (512);
			assert (*fxbuf);
#  endif
#else
			(void) posix_memalign ((void **) &fxbuf, 16, sizeof (char *));
			assert (fxbuf);
			(void) posix_memalign ((void **) fxbuf, 16, 512);
			assert (*fxbuf);
#endif

			memset (*fxbuf, 0, 512);

#ifdef COMPILER_MSVC
			char* buf = *fxbuf;
#ifdef _WIN64
			/* For 64-bit compilation, MSVC doesn't support inline assembly !!
			   ( https://docs.microsoft.com/en-us/cpp/assembler/inline/inline-assembler?view=msvc-160 ) */

			/* but instead, it uses something called 'x64 intrinsics'
			   1: ( https://docs.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-160 )
			   2: ( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave ) */
			_fxsave (buf);
#else
			__asm {
				mov eax, buf
				fxsave [eax]
			};
#endif
#else
			asm volatile (
				"fxsave (%0)"
				:
				: "r" (*fxbuf)
				: "memory"
				);
#endif

			uint32_t mxcsr_mask = *((uint32_t*) &((*fxbuf)[28]));

			/* if the mask is zero, set its default value (from Intel specs) */

			if (mxcsr_mask == 0) {
				mxcsr_mask = 0xffbf;
			}
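
			/* 0xffbf is the Intel-documented default mask; it has bit 6
			 * (DAZ) clear, so processors that predate DAZ correctly report
			 * the feature as unsupported below.
			 */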

			if (mxcsr_mask & (1<<6)) {
				_flags = Flags (_flags | HasDenormalsAreZero);
			}

#if !defined HAVE_POSIX_MEMALIGN && defined PLATFORM_WINDOWS
			_aligned_free (*fxbuf);
			_aligned_free (fxbuf);
#else
			free (*fxbuf);
			free (fxbuf);
#endif
		}

		/* finally get the CPU brand */

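		/* Extended leaves 0x80000002..0x80000004 each return 16 bytes of the
		 * 48-byte brand string (e.g., illustratively, "Intel(R) Core(TM) ...");
		 * leaf 0x80000000 first reports whether those leaves exist at all.
		 */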
		__cpuid (cpu_info, 0x80000000);

		const int parameter_end = 0x80000004;
		string cpu_brand;

		if (cpu_info[0] >= parameter_end) {
			char* cpu_string_ptr = cpu_string;

			for (int parameter = 0x80000002; parameter <= parameter_end &&
				     cpu_string_ptr < &cpu_string[sizeof(cpu_string)]; parameter++) {
				__cpuid(cpu_info, parameter);
				memcpy(cpu_string_ptr, cpu_info, sizeof(cpu_info));
				cpu_string_ptr += sizeof(cpu_info);
			}
			cpu_brand.assign(cpu_string, cpu_string_ptr - cpu_string);
			info << string_compose (_("CPU brand: %1"), cpu_brand) << endmsg;
		}
	}
#endif /* !ARCH_X86 */
}

FPU::~FPU ()
{
}