/*******************************************************************************
* Copyright 2016-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/*******************************************************************************
* Copyright (c) 2007 MITSUNARI Shigeo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of the copyright owner nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#ifndef XBYAK_XBYAK_UTIL_H_
#define XBYAK_XBYAK_UTIL_H_

#ifdef XBYAK_ONLY_CLASS_CPU
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <assert.h>
#ifndef XBYAK_THROW
	#define XBYAK_THROW(x) ;
	#define XBYAK_THROW_RET(x, y) return y;
#endif
#else
#include <string.h>

/**
	utility classes and functions for Xbyak
	Xbyak::util::Clock ; rdtsc timer
	Xbyak::util::Cpu ; detect CPU
	@note this header is UNDER CONSTRUCTION!
*/
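/*
	usage sketch (illustrative only; the feature flag checked and the measured
	code are assumptions, not part of this header):

		Xbyak::util::Cpu cpu;
		if (cpu.has(Xbyak::util::Cpu::tAVX2)) {
			// safe to generate an AVX2 code path
		}

		Xbyak::util::Clock clk;
		clk.begin();
		// ... code to be measured ...
		clk.end();
		double averageCycles = clk.getClock() / double(clk.getCount());
*/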
#include "xbyak.h"
#endif // XBYAK_ONLY_CLASS_CPU

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
	#define XBYAK_INTEL_CPU_SPECIFIC
#endif

#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
	#if (_MSC_VER < 1400) && defined(XBYAK32)
		static inline __declspec(naked) void __cpuid(int[4], int)
		{
			__asm {
				push	ebx
				push	esi
				mov		eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
				cpuid
				mov		esi, dword ptr [esp + 4 * 2 + 4] // data
				mov		dword ptr [esi], eax
				mov		dword ptr [esi + 4], ebx
				mov		dword ptr [esi + 8], ecx
				mov		dword ptr [esi + 12], edx
				pop		esi
				pop		ebx
				ret
			}
		}
	#else
		#include <intrin.h> // for __cpuid
	#endif
#else
	#ifndef __GNUC_PREREQ
		#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
	#endif
	#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
		#include <cpuid.h>
	#else
		#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
			// copy the cpuid result from ebx into esi before restoring ebx (the PIC register)
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#else
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#endif
	#endif
#endif
#endif

#ifdef XBYAK_USE_VTUNE
	// -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
	#include <jitprofiling.h>
	#ifdef _MSC_VER
		#pragma comment(lib, "libittnotify.lib")
	#endif
	#ifdef __linux__
		#include <dlfcn.h>
	#endif
#endif
#ifdef __linux__
	#define XBYAK_USE_PERF
#endif

namespace Xbyak { namespace util {

typedef enum {
	SmtLevel = 1,
	CoreLevel = 2
} IntelCpuTopologyLevel;

/**
	CPU detection class
*/
class Cpu {
	uint64_t type_;
	// system topology
	bool x2APIC_supported_;
	static const size_t maxTopologyLevels = 2;
	unsigned int numCores_[maxTopologyLevels];

	static const unsigned int maxNumberCacheLevels = 10;
	unsigned int dataCacheSize_[maxNumberCacheLevels];
	unsigned int coresSharignDataCache_[maxNumberCacheLevels];
	unsigned int dataCacheLevels_;

	unsigned int get32bitAsBE(const char *x) const
	{
		return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
	}
	unsigned int mask(int n) const
	{
		return (1U << n) - 1;
	}
	void setFamily()
	{
		unsigned int data[4] = {};
		getCpuid(1, data);
		stepping = data[0] & mask(4);
		model = (data[0] >> 4) & mask(4);
		family = (data[0] >> 8) & mask(4);
		// type = (data[0] >> 12) & mask(2);
		extModel = (data[0] >> 16) & mask(4);
		extFamily = (data[0] >> 20) & mask(8);
		if (family == 0x0f) {
			displayFamily = family + extFamily;
		} else {
			displayFamily = family;
		}
		if (family == 6 || family == 0x0f) {
			displayModel = (extModel << 4) + model;
		} else {
			displayModel = model;
		}
	}
	unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
	{
		return (val >> base) & ((1u << (end - base)) - 1);
	}
	void setNumCores()
	{
		if ((type_ & tINTEL) == 0) return;

		unsigned int data[4] = {};

		/* CAUTION: These numbers reflect the configuration as shipped by Intel. */
		getCpuidEx(0x0, 0, data);
		if (data[0] >= 0xB) {
			/*
				If leaf 0xB exists (i.e. x2APIC is supported), use it to get the
				number of SMT threads and the number of cores on the socket.

				Note that leaf 0xB can be zeroed out by a hypervisor.
			*/
			x2APIC_supported_ = true;
			for (unsigned int i = 0; i < maxTopologyLevels; i++) {
				getCpuidEx(0xB, i, data);
				IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
				if (level == SmtLevel || level == CoreLevel) {
					numCores_[level - 1] = extractBit(data[1], 0, 15);
				}
			}
			/*
				Fallback values in case a hypervisor has 0xB leaf zeroed-out.
			*/
			numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]);
			numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
		} else {
			/*
				Failed to determine the number of cores without x2APIC support.
				TODO: USE initial APIC ID to determine ncores.
			*/
			numCores_[SmtLevel - 1] = 0;
			numCores_[CoreLevel - 1] = 0;
		}

	}
	void setCacheHierarchy()
	{
		if ((type_ & tINTEL) == 0) return;
		const unsigned int NO_CACHE = 0;
		const unsigned int DATA_CACHE = 1;
//		const unsigned int INSTRUCTION_CACHE = 2;
		const unsigned int UNIFIED_CACHE = 3;
		unsigned int smt_width = 0;
		unsigned int logical_cores = 0;
		unsigned int data[4] = {};

		if (x2APIC_supported_) {
			smt_width = numCores_[0];
			logical_cores = numCores_[1];
		}

		/*
			Assumptions:
			- The first level of data cache is not shared (which is the case for
			  every existing architecture) and is used to determine the SMT width
			  for architectures not supporting leaf 0xB.
			- When leaf 4 reports a number of cores less than the number of cores
			  on the socket reported by leaf 0xB, it is the correct number of
			  cores, not an upper bound.
		*/
		for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
			getCpuidEx(0x4, i, data);
			unsigned int cacheType = extractBit(data[0], 0, 4);
			if (cacheType == NO_CACHE) break;
			if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
				unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
				if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
					actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
				}
				assert(actual_logical_cores != 0);
				dataCacheSize_[dataCacheLevels_] =
					(extractBit(data[1], 22, 31) + 1)
					* (extractBit(data[1], 12, 21) + 1)
					* (extractBit(data[1], 0, 11) + 1)
					* (data[2] + 1);
				if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
				assert(smt_width != 0);
				coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
				dataCacheLevels_++;
			}
		}
	}

public:
	int model;
	int family;
	int stepping;
	int extModel;
	int extFamily;
	int displayFamily; // family + extFamily
	int displayModel; // model + extModel

	unsigned int getNumCores(IntelCpuTopologyLevel level) const {
		if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
		switch (level) {
		case SmtLevel: return numCores_[level - 1];
		case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
		default: XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
		}
	}

	unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
	unsigned int getCoresSharingDataCache(unsigned int i) const
	{
		if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
		return coresSharignDataCache_[i];
	}
	unsigned int getDataCacheSize(unsigned int i) const
	{
		if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
		return dataCacheSize_[i];
	}
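	/*
		usage sketch: enumerate the detected data cache levels (the printf
		formatting below is illustrative; sizes are in bytes)

			Cpu cpu;
			for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) {
				printf("data cache %u: %u bytes, shared by %u core(s)\n",
					i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i));
			}
	*/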
	/*
		data[] = { eax, ebx, ecx, edx }
	*/
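	// example (a sketch): leaf 0 returns the vendor string split across ebx, edx, ecx,
	// i.e. data[1], data[3], data[2] hold "Genu", "ineI", "ntel" on Intel CPUs
	//   unsigned int data[4];
	//   Cpu::getCpuid(0, data);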
	static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
	{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
	#ifdef _MSC_VER
		__cpuid(reinterpret_cast<int*>(data), eaxIn);
	#else
		__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
	#endif
#else
		(void)eaxIn;
		(void)data;
#endif
	}
	static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
	{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
	#ifdef _MSC_VER
		__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
	#else
		__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
	#endif
#else
		(void)eaxIn;
		(void)ecxIn;
		(void)data;
#endif
	}
	static inline uint64_t getXfeature()
	{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
	#ifdef _MSC_VER
		return _xgetbv(0);
	#else
		unsigned int eax, edx;
		// xgetbv is not supported on gcc 4.2, so emit the raw instruction bytes instead
//		__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
		__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
		return ((uint64_t)edx << 32) | eax;
	#endif
#else
		return 0;
#endif
	}
	typedef uint64_t Type;

	static const Type NONE = 0;
	static const Type tMMX = 1 << 0;
	static const Type tMMX2 = 1 << 1;
	static const Type tCMOV = 1 << 2;
	static const Type tSSE = 1 << 3;
	static const Type tSSE2 = 1 << 4;
	static const Type tSSE3 = 1 << 5;
	static const Type tSSSE3 = 1 << 6;
	static const Type tSSE41 = 1 << 7;
	static const Type tSSE42 = 1 << 8;
	static const Type tPOPCNT = 1 << 9;
	static const Type tAESNI = 1 << 10;
	static const Type tSSE5 = 1 << 11;
	static const Type tOSXSAVE = 1 << 12;
	static const Type tPCLMULQDQ = 1 << 13;
	static const Type tAVX = 1 << 14;
	static const Type tFMA = 1 << 15;

	static const Type t3DN = 1 << 16;
	static const Type tE3DN = 1 << 17;
	static const Type tSSE4a = 1 << 18;
	static const Type tRDTSCP = 1 << 19;
	static const Type tAVX2 = 1 << 20;
	static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
	static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
	static const Type tLZCNT = 1 << 23;

	static const Type tINTEL = 1 << 24;
	static const Type tAMD = 1 << 25;

	static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
	static const Type tRDRAND = 1 << 27;
	static const Type tADX = 1 << 28; // adcx, adox
	static const Type tRDSEED = 1 << 29; // rdseed
	static const Type tSMAP = 1 << 30; // stac
	static const Type tHLE = uint64_t(1) << 31; // xacquire, xrelease, xtest
	static const Type tRTM = uint64_t(1) << 32; // xbegin, xend, xabort
	static const Type tF16C = uint64_t(1) << 33; // vcvtph2ps, vcvtps2ph
	static const Type tMOVBE = uint64_t(1) << 34; // movbe
	static const Type tAVX512F = uint64_t(1) << 35;
	static const Type tAVX512DQ = uint64_t(1) << 36;
	static const Type tAVX512_IFMA = uint64_t(1) << 37;
	static const Type tAVX512IFMA = tAVX512_IFMA;
	static const Type tAVX512PF = uint64_t(1) << 38;
	static const Type tAVX512ER = uint64_t(1) << 39;
	static const Type tAVX512CD = uint64_t(1) << 40;
	static const Type tAVX512BW = uint64_t(1) << 41;
	static const Type tAVX512VL = uint64_t(1) << 42;
	static const Type tAVX512_VBMI = uint64_t(1) << 43;
	static const Type tAVX512VBMI = tAVX512_VBMI; // old name; renamed in Intel's manual
	static const Type tAVX512_4VNNIW = uint64_t(1) << 44;
	static const Type tAVX512_4FMAPS = uint64_t(1) << 45;
	static const Type tPREFETCHWT1 = uint64_t(1) << 46;
	static const Type tPREFETCHW = uint64_t(1) << 47;
	static const Type tSHA = uint64_t(1) << 48;
	static const Type tMPX = uint64_t(1) << 49;
	static const Type tAVX512_VBMI2 = uint64_t(1) << 50;
	static const Type tGFNI = uint64_t(1) << 51;
	static const Type tVAES = uint64_t(1) << 52;
	static const Type tVPCLMULQDQ = uint64_t(1) << 53;
	static const Type tAVX512_VNNI = uint64_t(1) << 54;
	static const Type tAVX512_BITALG = uint64_t(1) << 55;
	static const Type tAVX512_VPOPCNTDQ = uint64_t(1) << 56;
	static const Type tAVX512_BF16 = uint64_t(1) << 57;
	static const Type tAVX512_VP2INTERSECT = uint64_t(1) << 58;
	static const Type tAMX_TILE = uint64_t(1) << 59;
	static const Type tAMX_INT8 = uint64_t(1) << 60;
	static const Type tAMX_BF16 = uint64_t(1) << 61;
	static const Type tAVX_VNNI = uint64_t(1) << 62;

	Cpu()
		: type_(NONE)
		, x2APIC_supported_(false)
		, numCores_()
		, dataCacheSize_()
		, coresSharignDataCache_()
		, dataCacheLevels_(0)
	{
		unsigned int data[4] = {};
		const unsigned int& EAX = data[0];
		const unsigned int& EBX = data[1];
		const unsigned int& ECX = data[2];
		const unsigned int& EDX = data[3];
		getCpuid(0, data);
		const unsigned int maxNum = EAX;
		static const char intel[] = "ntel";
		static const char amd[] = "cAMD";
		if (ECX == get32bitAsBE(amd)) {
			type_ |= tAMD;
			getCpuid(0x80000001, data);
			if (EDX & (1U << 31)) {
				type_ |= t3DN;
				// 3DNow! implies support for PREFETCHW on AMD
				type_ |= tPREFETCHW;
			}

			if (EDX & (1U << 29)) {
				// Long mode implies support for PREFETCHW on AMD
				type_ |= tPREFETCHW;
			}
		}
		if (ECX == get32bitAsBE(intel)) {
			type_ |= tINTEL;
		}

		// Extended flags information
		getCpuid(0x80000000, data);
		if (EAX >= 0x80000001) {
			getCpuid(0x80000001, data);

			if (EDX & (1U << 31)) type_ |= t3DN;
			if (EDX & (1U << 30)) type_ |= tE3DN;
			if (EDX & (1U << 27)) type_ |= tRDTSCP;
			if (EDX & (1U << 22)) type_ |= tMMX2;
			if (EDX & (1U << 15)) type_ |= tCMOV;
			if (ECX & (1U << 5)) type_ |= tLZCNT;
			if (ECX & (1U << 8)) type_ |= tPREFETCHW;
		}

		getCpuid(1, data);
		if (ECX & (1U << 0)) type_ |= tSSE3;
		if (ECX & (1U << 9)) type_ |= tSSSE3;
		if (ECX & (1U << 19)) type_ |= tSSE41;
		if (ECX & (1U << 20)) type_ |= tSSE42;
		if (ECX & (1U << 22)) type_ |= tMOVBE;
		if (ECX & (1U << 23)) type_ |= tPOPCNT;
		if (ECX & (1U << 25)) type_ |= tAESNI;
		if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
		if (ECX & (1U << 27)) type_ |= tOSXSAVE;
		if (ECX & (1U << 30)) type_ |= tRDRAND;
		if (ECX & (1U << 29)) type_ |= tF16C;

		if (EDX & (1U << 15)) type_ |= tCMOV;
		if (EDX & (1U << 23)) type_ |= tMMX;
		if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE;
		if (EDX & (1U << 26)) type_ |= tSSE2;

		if (type_ & tOSXSAVE) {
			// check XFEATURE_ENABLED_MASK[2:1] = '11b'
			uint64_t bv = getXfeature();
			if ((bv & 6) == 6) {
				if (ECX & (1U << 28)) type_ |= tAVX;
				if (ECX & (1U << 12)) type_ |= tFMA;
				// do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
#if !defined(__APPLE__)
				if (((bv >> 5) & 7) == 7)
#endif
				{
					getCpuidEx(7, 0, data);
					if (EBX & (1U << 16)) type_ |= tAVX512F;
					if (type_ & tAVX512F) {
						if (EBX & (1U << 17)) type_ |= tAVX512DQ;
						if (EBX & (1U << 21)) type_ |= tAVX512_IFMA;
						if (EBX & (1U << 26)) type_ |= tAVX512PF;
						if (EBX & (1U << 27)) type_ |= tAVX512ER;
						if (EBX & (1U << 28)) type_ |= tAVX512CD;
						if (EBX & (1U << 30)) type_ |= tAVX512BW;
						if (EBX & (1U << 31)) type_ |= tAVX512VL;
						if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
						if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
						if (ECX & (1U << 8)) type_ |= tGFNI;
						if (ECX & (1U << 9)) type_ |= tVAES;
						if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
						if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
						if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
						if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
						if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
						if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
						if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
					}
				}
			}
		}
		if (maxNum >= 7) {
			getCpuidEx(7, 0, data);
			const uint32_t maxNumSubLeaves = EAX;
			if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
			if (EBX & (1U << 3)) type_ |= tBMI1;
			if (EBX & (1U << 8)) type_ |= tBMI2;
			if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
			if (EBX & (1U << 18)) type_ |= tRDSEED;
			if (EBX & (1U << 19)) type_ |= tADX;
			if (EBX & (1U << 20)) type_ |= tSMAP;
			if (EBX & (1U << 4)) type_ |= tHLE;
			if (EBX & (1U << 11)) type_ |= tRTM;
			if (EBX & (1U << 14)) type_ |= tMPX;
			if (EBX & (1U << 29)) type_ |= tSHA;
			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
			if (EDX & (1U << 24)) type_ |= tAMX_TILE;
			if (EDX & (1U << 25)) type_ |= tAMX_INT8;
			if (EDX & (1U << 22)) type_ |= tAMX_BF16;
			if (maxNumSubLeaves >= 1) {
				getCpuidEx(7, 1, data);
				if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
				if (type_ & tAVX512F) {
					if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
				}
			}
		}
		setFamily();
		setNumCores();
		setCacheHierarchy();
	}
	void putFamily() const
	{
#ifndef XBYAK_ONLY_CLASS_CPU
		printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
			family, model, stepping, extFamily, extModel);
		printf("display:family=%X, model=%X\n", displayFamily, displayModel);
#endif
	}
	bool has(Type type) const
	{
		return (type & type_) != 0;
	}
};

#ifndef XBYAK_ONLY_CLASS_CPU
class Clock {
public:
	static inline uint64_t getRdtsc()
	{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
	#ifdef _MSC_VER
		return __rdtsc();
	#else
		unsigned int eax, edx;
		__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
		return ((uint64_t)edx << 32) | eax;
	#endif
#else
		// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
		return 0;
#endif
	}
	Clock()
		: clock_(0)
		, count_(0)
	{
	}
	void begin()
	{
		clock_ -= getRdtsc();
	}
	void end()
	{
		clock_ += getRdtsc();
		count_++;
	}
	int getCount() const { return count_; }
	uint64_t getClock() const { return clock_; }
	void clear() { count_ = 0; clock_ = 0; }
private:
	uint64_t clock_;
	int count_;
};

#ifdef XBYAK64
const int UseRCX = 1 << 6;
const int UseRDX = 1 << 7;

class Pack {
	static const size_t maxTblNum = 15;
	const Xbyak::Reg64 *tbl_[maxTblNum];
	size_t n_;
public:
	Pack() : tbl_(), n_(0) {}
	Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); }
	Pack(const Pack& rhs)
		: n_(rhs.n_)
	{
		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
	}
	Pack& operator=(const Pack& rhs)
	{
		n_ = rhs.n_;
		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
		return *this;
	}
	Pack(const Xbyak::Reg64& t0)
	{ n_ = 1; tbl_[0] = &t0; }
	Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
	Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
	Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
	Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
	Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
	Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
	Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
	Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
	Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
	Pack& append(const Xbyak::Reg64& t)
	{
		if (n_ == maxTblNum) {
			fprintf(stderr, "ERR Pack::can't append\n");
			XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this)
		}
		tbl_[n_++] = &t;
		return *this;
	}
	void init(const Xbyak::Reg64 *tbl, size_t n)
	{
		if (n > maxTblNum) {
			fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
			XBYAK_THROW(ERR_BAD_PARAMETER)
		}
		n_ = n;
		for (size_t i = 0; i < n; i++) {
			tbl_[i] = &tbl[i];
		}
	}
	const Xbyak::Reg64& operator[](size_t n) const
	{
		if (n >= n_) {
			fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
			XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax)
		}
		return *tbl_[n];
	}
	size_t size() const { return n_; }
	/*
		get tbl[pos, pos + num)
	*/
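	// e.g. for a Pack p holding 5 registers, p.sub(1, 2) is the pack {p[1], p[2]}
	// and p.sub(3) is {p[3], p[4]} (a sketch of the intended semantics)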
	Pack sub(size_t pos, size_t num = size_t(-1)) const
	{
		if (num == size_t(-1)) num = n_ - pos;
		if (pos + num > n_) {
			fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
			XBYAK_THROW_RET(ERR_BAD_PARAMETER, Pack())
		}
		Pack pack;
		pack.n_ = num;
		for (size_t i = 0; i < num; i++) {
			pack.tbl_[i] = tbl_[pos + i];
		}
		return pack;
	}
	void put() const
	{
		for (size_t i = 0; i < n_; i++) {
			printf("%s ", tbl_[i]->toString());
		}
		printf("\n");
	}
};

class StackFrame {
#ifdef XBYAK64_WIN
	static const int noSaveNum = 6;
	static const int rcxPos = 0;
	static const int rdxPos = 1;
#else
	static const int noSaveNum = 8;
	static const int rcxPos = 3;
	static const int rdxPos = 2;
#endif
	static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
	Xbyak::CodeGenerator *code_;
	int pNum_;
	int tNum_;
	bool useRcx_;
	bool useRdx_;
	int saveNum_;
	int P_;
	bool makeEpilog_;
	Xbyak::Reg64 pTbl_[4];
	Xbyak::Reg64 tTbl_[maxRegNum];
	Pack p_;
	Pack t_;
	StackFrame(const StackFrame&);
	void operator=(const StackFrame&);
public:
	const Pack& p;
	const Pack& t;
	/*
		make a stack frame
		@param code [in] code generator to emit into
		@param pNum [in] number of function parameters (0 <= pNum <= 4)
		@param tNum [in] number of temporary registers (0 <= tNum, optionally or'ed with UseRCX/UseRDX); pNum + tNum (+1 if UseRCX) (+1 if UseRDX) <= 14
		@param stackSizeByte [in] local stack size in bytes
		@param makeEpilog [in] automatically call close() in the destructor if true

		you can use
		rax
		p[0], ..., p[pNum - 1]
		t[0], ..., t[tNum - 1]
		rcx if tNum & UseRCX
		rdx if tNum & UseRDX
		rsp[0..stackSizeByte - 1]
	*/
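	/*
		usage sketch (the wrapper function and the generated arithmetic are
		illustrative, not part of this header):

			void genAdd3(Xbyak::CodeGenerator& gen)
			{
				StackFrame sf(&gen, 3); // e.g. int64_t f(int64_t a, int64_t b, int64_t c)
				gen.mov(gen.rax, sf.p[0]);
				gen.add(gen.rax, sf.p[1]);
				gen.add(gen.rax, sf.p[2]);
			} // the epilog (pops and ret) is emitted automatically by ~StackFrame
	*/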
	StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
		: code_(code)
		, pNum_(pNum)
		, tNum_(tNum & ~(UseRCX | UseRDX))
		, useRcx_((tNum & UseRCX) != 0)
		, useRdx_((tNum & UseRDX) != 0)
		, saveNum_(0)
		, P_(0)
		, makeEpilog_(makeEpilog)
		, p(p_)
		, t(t_)
	{
		using namespace Xbyak;
		if (pNum < 0 || pNum > 4) XBYAK_THROW(ERR_BAD_PNUM)
		const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
		if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
		const Reg64& _rsp = code->rsp;
		saveNum_ = (std::max)(0, allRegNum - noSaveNum);
		const int *tbl = getOrderTbl() + noSaveNum;
		for (int i = 0; i < saveNum_; i++) {
			code->push(Reg64(tbl[i]));
		}
		P_ = (stackSizeByte + 7) / 8;
		if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // if (rsp % 16) == 8 at this point, increment P_ to keep 16-byte alignment
		P_ *= 8;
		if (P_ > 0) code->sub(_rsp, P_);
		int pos = 0;
		for (int i = 0; i < pNum; i++) {
			pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
		}
		for (int i = 0; i < tNum_; i++) {
			tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
		}
		if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx);
		if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx);
		p_.init(pTbl_, pNum);
		t_.init(tTbl_, tNum_);
	}
	/*
		make epilog manually
		@param callRet [in] call ret() if true
	*/
	void close(bool callRet = true)
	{
		using namespace Xbyak;
		const Reg64& _rsp = code_->rsp;
		const int *tbl = getOrderTbl() + noSaveNum;
		if (P_ > 0) code_->add(_rsp, P_);
		for (int i = 0; i < saveNum_; i++) {
			code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
		}

		if (callRet) code_->ret();
	}
	~StackFrame()
	{
		if (!makeEpilog_) return;
		close();
	}
private:
	const int *getOrderTbl() const
	{
		using namespace Xbyak;
		static const int tbl[] = {
#ifdef XBYAK64_WIN
			Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
#else
			Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
#endif
			Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
		};
		return &tbl[0];
	}
	int getRegIdx(int& pos) const
	{
		assert(pos < maxRegNum);
		using namespace Xbyak;
		const int *tbl = getOrderTbl();
		int r = tbl[pos++];
		if (useRcx_) {
			if (r == Operand::RCX) { return Operand::R10; }
			if (r == Operand::R10) { r = tbl[pos++]; }
		}
		if (useRdx_) {
			if (r == Operand::RDX) { return Operand::R11; }
			if (r == Operand::R11) { return tbl[pos++]; }
		}
		return r;
	}
};
#endif

class Profiler {
	int mode_;
	const char *suffix_;
	const void *startAddr_;
#ifdef XBYAK_USE_PERF
	FILE *fp_;
#endif
public:
	enum {
		None = 0,
		Perf = 1,
		VTune = 2
	};
	Profiler()
		: mode_(None)
		, suffix_("")
		, startAddr_(0)
#ifdef XBYAK_USE_PERF
		, fp_(0)
#endif
	{
	}
	// append suffix to funcName
	void setNameSuffix(const char *suffix)
	{
		suffix_ = suffix;
	}
	void setStartAddr(const void *startAddr)
	{
		startAddr_ = startAddr;
	}
	void init(int mode)
	{
		mode_ = None;
		switch (mode) {
		default:
		case None:
			return;
		case Perf:
#ifdef XBYAK_USE_PERF
			close();
			{
				const int pid = getpid();
				char name[128];
				snprintf(name, sizeof(name), "/tmp/perf-%d.map", pid);
				fp_ = fopen(name, "a+");
				if (fp_ == 0) {
					fprintf(stderr, "can't open %s\n", name);
					return;
				}
			}
			mode_ = Perf;
#endif
			return;
		case VTune:
#ifdef XBYAK_USE_VTUNE
			dlopen("dummy", RTLD_LAZY); // force dlopen to be loaded to enable JIT profiling
			if (iJIT_IsProfilingActive() != iJIT_SAMPLING_ON) {
				fprintf(stderr, "VTune profiling is not active\n");
				return;
			}
			mode_ = VTune;
#endif
			return;
		}
	}
	~Profiler()
	{
		close();
	}
	void close()
	{
#ifdef XBYAK_USE_PERF
		if (fp_ == 0) return;
		fclose(fp_);
		fp_ = 0;
#endif
	}
	void set(const char *funcName, const void *startAddr, size_t funcSize) const
	{
		if (mode_ == None) return;
#if !defined(XBYAK_USE_PERF) && !defined(XBYAK_USE_VTUNE)
		(void)funcName;
		(void)startAddr;
		(void)funcSize;
#endif
#ifdef XBYAK_USE_PERF
		if (mode_ == Perf) {
			if (fp_ == 0) return;
			fprintf(fp_, "%llx %zx %s%s", (long long)startAddr, funcSize, funcName, suffix_);
			/*
				perf does not recognize function names shorter than 3 characters,
				so append '_' to the end of the name if necessary
			*/
			size_t n = strlen(funcName) + strlen(suffix_);
			for (size_t i = n; i < 3; i++) {
				fprintf(fp_, "_");
			}
			fprintf(fp_, "\n");
			fflush(fp_);
		}
#endif
#ifdef XBYAK_USE_VTUNE
		if (mode_ != VTune) return;
		char className[] = "";
		char fileName[] = "";
		iJIT_Method_Load jmethod = {};
		jmethod.method_id = iJIT_GetNewMethodID();
		jmethod.class_file_name = className;
		jmethod.source_file_name = fileName;
		jmethod.method_load_address = const_cast<void*>(startAddr);
		jmethod.method_size = funcSize;
		jmethod.line_number_size = 0;
		char buf[128];
		snprintf(buf, sizeof(buf), "%s%s", funcName, suffix_);
		jmethod.method_name = buf;
		iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, (void*)&jmethod);
#endif
	}
	/*
		for continuous set
		funcSize = endAddr - <previous set endAddr>
	*/
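	// usage sketch (prof/code and genFuncA/genFuncB are hypothetical):
	//   prof.setStartAddr(code.getCurr());
	//   genFuncA(code); prof.set("funcA", code.getCurr());
	//   genFuncB(code); prof.set("funcB", code.getCurr());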
	void set(const char *funcName, const void *endAddr)
	{
		set(funcName, startAddr_, (size_t)endAddr - (size_t)startAddr_);
		startAddr_ = endAddr;
	}
};
#endif // XBYAK_ONLY_CLASS_CPU

} } // end of util

#endif