1 #ifndef XBYAK_XBYAK_UTIL_H_
2 #define XBYAK_XBYAK_UTIL_H_
3 #include <string.h>
4 
5 /**
6 	utility class and functions for Xbyak
7 	Xbyak::util::Clock ; rdtsc timer
8 	Xbyak::util::Cpu ; detect CPU
9 	@note this header is UNDER CONSTRUCTION!
10 */
11 #include "xbyak.h"
12 
13 #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
14 	#define XBYAK_INTEL_CPU_SPECIFIC
15 #endif
16 
17 #ifdef XBYAK_INTEL_CPU_SPECIFIC
18 #ifdef _MSC_VER
19 	#if (_MSC_VER < 1400) && defined(XBYAK32)
		/*
			Hand-written __cpuid(data, eaxIn), used only when _MSC_VER < 1400 and XBYAK32;
			other MSVC builds take __cpuid from <intrin.h> instead.
			Executes cpuid with eax = eaxIn and stores { eax, ebx, ecx, edx } to data[0..3].
			The function is naked, so the arguments are read directly relative to esp.
		*/
		static inline __declspec(naked) void __cpuid(int[4], int)
		{
			__asm {
				// ebx and esi are saved here and restored before ret
				push	ebx
				push	esi
				// 4 * 2 skips the two registers pushed above;
				// the remaining +8 / +4 select the second / first argument
				mov		eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
				cpuid
				mov		esi, dword ptr [esp + 4 * 2 + 4] // data
				mov		dword ptr [esi], eax
				mov		dword ptr [esi + 4], ebx
				mov		dword ptr [esi + 8], ecx
				mov		dword ptr [esi + 12], edx
				pop		esi
				pop		ebx
				ret
			}
		}
37 	#else
38 		#include <intrin.h> // for __cpuid
39 	#endif
40 #else
	// Fallback definition, used only if __GNUC_PREREQ is not already defined:
	// evaluates to true when (__GNUC__, __GNUC_MINOR__) >= (major, minor).
	#ifndef __GNUC_PREREQ
    	#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
	#endif
44 	#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
45 		#include <cpuid.h>
46 	#else
47 		#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
48 			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
49 			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
50 		#else
51 			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
52 			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
53 		#endif
54 	#endif
55 #endif
56 #endif
57 
58 #ifdef XBYAK_USE_VTUNE
59 	// -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
60 	#include <jitprofiling.h>
61 	#ifdef _MSC_VER
62 		#pragma comment(lib, "libittnotify.lib")
63 	#endif
64 	#ifdef __linux__
65 		#include <dlfcn.h>
66 	#endif
67 #endif
68 #ifdef __linux__
69 	#define XBYAK_USE_PERF
70 #endif
71 
72 namespace Xbyak { namespace util {
73 
/*
	topology level selector for Cpu::getNumCores()
	(value - 1) is used as an index into the per-level core counts
*/
enum IntelCpuTopologyLevel {
	SmtLevel = 1,
	CoreLevel = 2
};
78 
79 /**
80 	CPU detection class
81 */
82 class Cpu {
83 	uint64 type_;
84 	//system topology
85 	bool x2APIC_supported_;
86 	static const size_t maxTopologyLevels = 2;
87 	unsigned int numCores_[maxTopologyLevels];
88 
89 	static const unsigned int maxNumberCacheLevels = 10;
90 	unsigned int dataCacheSize_[maxNumberCacheLevels];
91 	unsigned int coresSharignDataCache_[maxNumberCacheLevels];
92 	unsigned int dataCacheLevels_;
93 
get32bitAsBE(const char * x)94 	unsigned int get32bitAsBE(const char *x) const
95 	{
96 		return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
97 	}
mask(int n)98 	unsigned int mask(int n) const
99 	{
100 		return (1U << n) - 1;
101 	}
setFamily()102 	void setFamily()
103 	{
104 		unsigned int data[4] = {};
105 		getCpuid(1, data);
106 		stepping = data[0] & mask(4);
107 		model = (data[0] >> 4) & mask(4);
108 		family = (data[0] >> 8) & mask(4);
109 		// type = (data[0] >> 12) & mask(2);
110 		extModel = (data[0] >> 16) & mask(4);
111 		extFamily = (data[0] >> 20) & mask(8);
112 		if (family == 0x0f) {
113 			displayFamily = family + extFamily;
114 		} else {
115 			displayFamily = family;
116 		}
117 		if (family == 6 || family == 0x0f) {
118 			displayModel = (extModel << 4) + model;
119 		} else {
120 			displayModel = model;
121 		}
122 	}
extractBit(unsigned int val,unsigned int base,unsigned int end)123 	unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
124 	{
125 		return (val >> base) & ((1u << (end - base)) - 1);
126 	}
setNumCores()127 	void setNumCores()
128 	{
129 		if ((type_ & tINTEL) == 0) return;
130 
131 		unsigned int data[4] = {};
132 
133 		 /* CAUTION: These numbers are configuration as shipped by Intel. */
134 		getCpuidEx(0x0, 0, data);
135 		if (data[0] >= 0xB) {
136 			 /*
137 				if leaf 11 exists(x2APIC is supported),
138 				we use it to get the number of smt cores and cores on socket
139 
140 				leaf 0xB can be zeroed-out by a hypervisor
141 			*/
142 			x2APIC_supported_ = true;
143 			for (unsigned int i = 0; i < maxTopologyLevels; i++) {
144 				getCpuidEx(0xB, i, data);
145 				IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
146 				if (level == SmtLevel || level == CoreLevel) {
147 					numCores_[level - 1] = extractBit(data[1], 0, 15);
148 				}
149 			}
150 			/*
151 				Fallback values in case a hypervisor has 0xB leaf zeroed-out.
152 			*/
153 			numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]);
154 			numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
155 		} else {
156 			/*
157 				Failed to deremine num of cores without x2APIC support.
158 				TODO: USE initial APIC ID to determine ncores.
159 			*/
160 			numCores_[SmtLevel - 1] = 0;
161 			numCores_[CoreLevel - 1] = 0;
162 		}
163 
164 	}
setCacheHierarchy()165 	void setCacheHierarchy()
166 	{
167 		if ((type_ & tINTEL) == 0) return;
168 		const unsigned int NO_CACHE = 0;
169 		const unsigned int DATA_CACHE = 1;
170 //		const unsigned int INSTRUCTION_CACHE = 2;
171 		const unsigned int UNIFIED_CACHE = 3;
172 		unsigned int smt_width = 0;
173 		unsigned int logical_cores = 0;
174 		unsigned int data[4] = {};
175 
176 		if (x2APIC_supported_) {
177 			smt_width = numCores_[0];
178 			logical_cores = numCores_[1];
179 		}
180 
181 		/*
182 			Assumptions:
183 			the first level of data cache is not shared (which is the
184 			case for every existing architecture) and use this to
185 			determine the SMT width for arch not supporting leaf 11.
186 			when leaf 4 reports a number of core less than numCores_
187 			on socket reported by leaf 11, then it is a correct number
188 			of cores not an upperbound.
189 		*/
190 		for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
191 			getCpuidEx(0x4, i, data);
192 			unsigned int cacheType = extractBit(data[0], 0, 4);
193 			if (cacheType == NO_CACHE) break;
194 			if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
195 				unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
196 				if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
197 					actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
198 				}
199 				assert(actual_logical_cores != 0);
200 				dataCacheSize_[dataCacheLevels_] =
201 					(extractBit(data[1], 22, 31) + 1)
202 					* (extractBit(data[1], 12, 21) + 1)
203 					* (extractBit(data[1], 0, 11) + 1)
204 					* (data[2] + 1);
205 				if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
206 				assert(smt_width != 0);
207 				coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
208 				dataCacheLevels_++;
209 			}
210 		}
211 	}
212 
213 public:
214 	int model;
215 	int family;
216 	int stepping;
217 	int extModel;
218 	int extFamily;
219 	int displayFamily; // family + extFamily
220 	int displayModel; // model + extModel
221 
getNumCores(IntelCpuTopologyLevel level)222 	unsigned int getNumCores(IntelCpuTopologyLevel level) const {
223 		if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
224 		switch (level) {
225 		case SmtLevel: return numCores_[level - 1];
226 		case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
227 		default: XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
228 		}
229 	}
230 
getDataCacheLevels()231 	unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
getCoresSharingDataCache(unsigned int i)232 	unsigned int getCoresSharingDataCache(unsigned int i) const
233 	{
234 		if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
235 		return coresSharignDataCache_[i];
236 	}
getDataCacheSize(unsigned int i)237 	unsigned int getDataCacheSize(unsigned int i) const
238 	{
239 		if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
240 		return dataCacheSize_[i];
241 	}
242 
243 	/*
244 		data[] = { eax, ebx, ecx, edx }
245 	*/
getCpuid(unsigned int eaxIn,unsigned int data[4])246 	static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
247 	{
248 #ifdef XBYAK_INTEL_CPU_SPECIFIC
249 	#ifdef _MSC_VER
250 		__cpuid(reinterpret_cast<int*>(data), eaxIn);
251 	#else
252 		__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
253 	#endif
254 #else
255 		(void)eaxIn;
256 		(void)data;
257 #endif
258 	}
getCpuidEx(unsigned int eaxIn,unsigned int ecxIn,unsigned int data[4])259 	static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
260 	{
261 #ifdef XBYAK_INTEL_CPU_SPECIFIC
262 	#ifdef _MSC_VER
263 		__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
264 	#else
265 		__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
266 	#endif
267 #else
268 		(void)eaxIn;
269 		(void)ecxIn;
270 		(void)data;
271 #endif
272 	}
getXfeature()273 	static inline uint64 getXfeature()
274 	{
275 #ifdef XBYAK_INTEL_CPU_SPECIFIC
276 	#ifdef _MSC_VER
277 		return _xgetbv(0);
278 	#else
279 		unsigned int eax, edx;
280 		// xgetvb is not support on gcc 4.2
281 //		__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
282 		__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
283 		return ((uint64)edx << 32) | eax;
284 	#endif
285 #else
286 		return 0;
287 #endif
288 	}
289 	typedef uint64 Type;
290 
291 	static const Type NONE = 0;
292 	static const Type tMMX = 1 << 0;
293 	static const Type tMMX2 = 1 << 1;
294 	static const Type tCMOV = 1 << 2;
295 	static const Type tSSE = 1 << 3;
296 	static const Type tSSE2 = 1 << 4;
297 	static const Type tSSE3 = 1 << 5;
298 	static const Type tSSSE3 = 1 << 6;
299 	static const Type tSSE41 = 1 << 7;
300 	static const Type tSSE42 = 1 << 8;
301 	static const Type tPOPCNT = 1 << 9;
302 	static const Type tAESNI = 1 << 10;
303 	static const Type tSSE5 = 1 << 11;
304 	static const Type tOSXSAVE = 1 << 12;
305 	static const Type tPCLMULQDQ = 1 << 13;
306 	static const Type tAVX = 1 << 14;
307 	static const Type tFMA = 1 << 15;
308 
309 	static const Type t3DN = 1 << 16;
310 	static const Type tE3DN = 1 << 17;
311 	static const Type tSSE4a = 1 << 18;
312 	static const Type tRDTSCP = 1 << 19;
313 	static const Type tAVX2 = 1 << 20;
314 	static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
315 	static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
316 	static const Type tLZCNT = 1 << 23;
317 
318 	static const Type tINTEL = 1 << 24;
319 	static const Type tAMD = 1 << 25;
320 
321 	static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
322 	static const Type tRDRAND = 1 << 27;
323 	static const Type tADX = 1 << 28; // adcx, adox
324 	static const Type tRDSEED = 1 << 29; // rdseed
325 	static const Type tSMAP = 1 << 30; // stac
326 	static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest
327 	static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort
328 	static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph
329 	static const Type tMOVBE = uint64(1) << 34; // mobve
330 	static const Type tAVX512F = uint64(1) << 35;
331 	static const Type tAVX512DQ = uint64(1) << 36;
332 	static const Type tAVX512_IFMA = uint64(1) << 37;
333 	static const Type tAVX512IFMA = tAVX512_IFMA;
334 	static const Type tAVX512PF = uint64(1) << 38;
335 	static const Type tAVX512ER = uint64(1) << 39;
336 	static const Type tAVX512CD = uint64(1) << 40;
337 	static const Type tAVX512BW = uint64(1) << 41;
338 	static const Type tAVX512VL = uint64(1) << 42;
339 	static const Type tAVX512_VBMI = uint64(1) << 43;
340 	static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
341 	static const Type tAVX512_4VNNIW = uint64(1) << 44;
342 	static const Type tAVX512_4FMAPS = uint64(1) << 45;
343 	static const Type tPREFETCHWT1 = uint64(1) << 46;
344 	static const Type tPREFETCHW = uint64(1) << 47;
345 	static const Type tSHA = uint64(1) << 48;
346 	static const Type tMPX = uint64(1) << 49;
347 	static const Type tAVX512_VBMI2 = uint64(1) << 50;
348 	static const Type tGFNI = uint64(1) << 51;
349 	static const Type tVAES = uint64(1) << 52;
350 	static const Type tVPCLMULQDQ = uint64(1) << 53;
351 	static const Type tAVX512_VNNI = uint64(1) << 54;
352 	static const Type tAVX512_BITALG = uint64(1) << 55;
353 	static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56;
354 	static const Type tAVX512_BF16 = uint64(1) << 57;
355 	static const Type tAVX512_VP2INTERSECT = uint64(1) << 58;
356 	static const Type tAMX_TILE = uint64(1) << 59;
357 	static const Type tAMX_INT8 = uint64(1) << 60;
358 	static const Type tAMX_BF16 = uint64(1) << 61;
359 
Cpu()360 	Cpu()
361 		: type_(NONE)
362 		, x2APIC_supported_(false)
363 		, numCores_()
364 		, dataCacheSize_()
365 		, coresSharignDataCache_()
366 		, dataCacheLevels_(0)
367 	{
368 		unsigned int data[4] = {};
369 		const unsigned int& EAX = data[0];
370 		const unsigned int& EBX = data[1];
371 		const unsigned int& ECX = data[2];
372 		const unsigned int& EDX = data[3];
373 		getCpuid(0, data);
374 		const unsigned int maxNum = EAX;
375 		static const char intel[] = "ntel";
376 		static const char amd[] = "cAMD";
377 		if (ECX == get32bitAsBE(amd)) {
378 			type_ |= tAMD;
379 			getCpuid(0x80000001, data);
380 			if (EDX & (1U << 31)) type_ |= t3DN;
381 			if (EDX & (1U << 15)) type_ |= tCMOV;
382 			if (EDX & (1U << 30)) type_ |= tE3DN;
383 			if (EDX & (1U << 22)) type_ |= tMMX2;
384 			if (EDX & (1U << 27)) type_ |= tRDTSCP;
385 		}
386 		if (ECX == get32bitAsBE(intel)) {
387 			type_ |= tINTEL;
388 			getCpuid(0x80000001, data);
389 			if (EDX & (1U << 27)) type_ |= tRDTSCP;
390 			if (ECX & (1U << 5)) type_ |= tLZCNT;
391 			if (ECX & (1U << 8)) type_ |= tPREFETCHW;
392 		}
393 		getCpuid(1, data);
394 		if (ECX & (1U << 0)) type_ |= tSSE3;
395 		if (ECX & (1U << 9)) type_ |= tSSSE3;
396 		if (ECX & (1U << 19)) type_ |= tSSE41;
397 		if (ECX & (1U << 20)) type_ |= tSSE42;
398 		if (ECX & (1U << 22)) type_ |= tMOVBE;
399 		if (ECX & (1U << 23)) type_ |= tPOPCNT;
400 		if (ECX & (1U << 25)) type_ |= tAESNI;
401 		if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
402 		if (ECX & (1U << 27)) type_ |= tOSXSAVE;
403 		if (ECX & (1U << 30)) type_ |= tRDRAND;
404 		if (ECX & (1U << 29)) type_ |= tF16C;
405 
406 		if (EDX & (1U << 15)) type_ |= tCMOV;
407 		if (EDX & (1U << 23)) type_ |= tMMX;
408 		if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE;
409 		if (EDX & (1U << 26)) type_ |= tSSE2;
410 
411 		if (type_ & tOSXSAVE) {
412 			// check XFEATURE_ENABLED_MASK[2:1] = '11b'
413 			uint64 bv = getXfeature();
414 			if ((bv & 6) == 6) {
415 				if (ECX & (1U << 28)) type_ |= tAVX;
416 				if (ECX & (1U << 12)) type_ |= tFMA;
417 				if (((bv >> 5) & 7) == 7) {
418 					getCpuidEx(7, 0, data);
419 					if (EBX & (1U << 16)) type_ |= tAVX512F;
420 					if (type_ & tAVX512F) {
421 						if (EBX & (1U << 17)) type_ |= tAVX512DQ;
422 						if (EBX & (1U << 21)) type_ |= tAVX512_IFMA;
423 						if (EBX & (1U << 26)) type_ |= tAVX512PF;
424 						if (EBX & (1U << 27)) type_ |= tAVX512ER;
425 						if (EBX & (1U << 28)) type_ |= tAVX512CD;
426 						if (EBX & (1U << 30)) type_ |= tAVX512BW;
427 						if (EBX & (1U << 31)) type_ |= tAVX512VL;
428 						if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
429 						if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
430 						if (ECX & (1U << 8)) type_ |= tGFNI;
431 						if (ECX & (1U << 9)) type_ |= tVAES;
432 						if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
433 						if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
434 						if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
435 						if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
436 						if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
437 						if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
438 						if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
439 					}
440 					// EAX=07H, ECX=1
441 					getCpuidEx(7, 1, data);
442 					if (type_ & tAVX512F) {
443 						if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
444 					}
445 				}
446 			}
447 		}
448 		if (maxNum >= 7) {
449 			getCpuidEx(7, 0, data);
450 			if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
451 			if (EBX & (1U << 3)) type_ |= tBMI1;
452 			if (EBX & (1U << 8)) type_ |= tBMI2;
453 			if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
454 			if (EBX & (1U << 18)) type_ |= tRDSEED;
455 			if (EBX & (1U << 19)) type_ |= tADX;
456 			if (EBX & (1U << 20)) type_ |= tSMAP;
457 			if (EBX & (1U << 4)) type_ |= tHLE;
458 			if (EBX & (1U << 11)) type_ |= tRTM;
459 			if (EBX & (1U << 14)) type_ |= tMPX;
460 			if (EBX & (1U << 29)) type_ |= tSHA;
461 			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
462 			if (EDX & (1U << 24)) type_ |= tAMX_TILE;
463 			if (EDX & (1U << 25)) type_ |= tAMX_INT8;
464 			if (EDX & (1U << 22)) type_ |= tAMX_BF16;
465 		}
466 		setFamily();
467 		setNumCores();
468 		setCacheHierarchy();
469 	}
putFamily()470 	void putFamily() const
471 	{
472 		printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
473 			family, model, stepping, extFamily, extModel);
474 		printf("display:family=%X, model=%X\n", displayFamily, displayModel);
475 	}
has(Type type)476 	bool has(Type type) const
477 	{
478 		return (type & type_) != 0;
479 	}
480 };
481 
482 class Clock {
483 public:
getRdtsc()484 	static inline uint64 getRdtsc()
485 	{
486 #ifdef XBYAK_INTEL_CPU_SPECIFIC
487 	#ifdef _MSC_VER
488 		return __rdtsc();
489 	#else
490 		unsigned int eax, edx;
491 		__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
492 		return ((uint64)edx << 32) | eax;
493 	#endif
494 #else
495 		// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
496 		return 0;
497 #endif
498 	}
Clock()499 	Clock()
500 		: clock_(0)
501 		, count_(0)
502 	{
503 	}
begin()504 	void begin()
505 	{
506 		clock_ -= getRdtsc();
507 	}
end()508 	void end()
509 	{
510 		clock_ += getRdtsc();
511 		count_++;
512 	}
getCount()513 	int getCount() const { return count_; }
getClock()514 	uint64 getClock() const { return clock_; }
clear()515 	void clear() { count_ = 0; clock_ = 0; }
516 private:
517 	uint64 clock_;
518 	int count_;
519 };
520 
521 #ifdef XBYAK64
// flags to be OR-ed into the tNum argument of StackFrame
const int UseRCX = 0x40; // 1 << 6
const int UseRDX = 0x80; // 1 << 7
524 
525 class Pack {
526 	static const size_t maxTblNum = 15;
527 	const Xbyak::Reg64 *tbl_[maxTblNum];
528 	size_t n_;
529 public:
Pack()530 	Pack() : tbl_(), n_(0) {}
Pack(const Xbyak::Reg64 * tbl,size_t n)531 	Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); }
Pack(const Pack & rhs)532 	Pack(const Pack& rhs)
533 		: n_(rhs.n_)
534 	{
535 		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
536 	}
537 	Pack& operator=(const Pack& rhs)
538 	{
539 		n_ = rhs.n_;
540 		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
541 		return *this;
542 	}
Pack(const Xbyak::Reg64 & t0)543 	Pack(const Xbyak::Reg64& t0)
544 	{ n_ = 1; tbl_[0] = &t0; }
Pack(const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)545 	Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
546 	{ n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
Pack(const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)547 	Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
548 	{ n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
Pack(const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)549 	Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
550 	{ n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
Pack(const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)551 	Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
552 	{ n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
Pack(const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)553 	Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
554 	{ n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
Pack(const Xbyak::Reg64 & t6,const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)555 	Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
556 	{ n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
Pack(const Xbyak::Reg64 & t7,const Xbyak::Reg64 & t6,const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)557 	Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
558 	{ n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
Pack(const Xbyak::Reg64 & t8,const Xbyak::Reg64 & t7,const Xbyak::Reg64 & t6,const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)559 	Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
560 	{ n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
Pack(const Xbyak::Reg64 & t9,const Xbyak::Reg64 & t8,const Xbyak::Reg64 & t7,const Xbyak::Reg64 & t6,const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)561 	Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
562 	{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
append(const Xbyak::Reg64 & t)563 	Pack& append(const Xbyak::Reg64& t)
564 	{
565 		if (n_ == maxTblNum) {
566 			fprintf(stderr, "ERR Pack::can't append\n");
567 			XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this)
568 		}
569 		tbl_[n_++] = &t;
570 		return *this;
571 	}
init(const Xbyak::Reg64 * tbl,size_t n)572 	void init(const Xbyak::Reg64 *tbl, size_t n)
573 	{
574 		if (n > maxTblNum) {
575 			fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
576 			XBYAK_THROW(ERR_BAD_PARAMETER)
577 		}
578 		n_ = n;
579 		for (size_t i = 0; i < n; i++) {
580 			tbl_[i] = &tbl[i];
581 		}
582 	}
583 	const Xbyak::Reg64& operator[](size_t n) const
584 	{
585 		if (n >= n_) {
586 			fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
587 			XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax)
588 		}
589 		return *tbl_[n];
590 	}
size()591 	size_t size() const { return n_; }
592 	/*
593 		get tbl[pos, pos + num)
594 	*/
595 	Pack sub(size_t pos, size_t num = size_t(-1)) const
596 	{
597 		if (num == size_t(-1)) num = n_ - pos;
598 		if (pos + num > n_) {
599 			fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
600 			XBYAK_THROW_RET(ERR_BAD_PARAMETER, Pack())
601 		}
602 		Pack pack;
603 		pack.n_ = num;
604 		for (size_t i = 0; i < num; i++) {
605 			pack.tbl_[i] = tbl_[pos + i];
606 		}
607 		return pack;
608 	}
put()609 	void put() const
610 	{
611 		for (size_t i = 0; i < n_; i++) {
612 			printf("%s ", tbl_[i]->toString());
613 		}
614 		printf("\n");
615 	}
616 };
617 
618 class StackFrame {
619 #ifdef XBYAK64_WIN
620 	static const int noSaveNum = 6;
621 	static const int rcxPos = 0;
622 	static const int rdxPos = 1;
623 #else
624 	static const int noSaveNum = 8;
625 	static const int rcxPos = 3;
626 	static const int rdxPos = 2;
627 #endif
628 	static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
629 	Xbyak::CodeGenerator *code_;
630 	int pNum_;
631 	int tNum_;
632 	bool useRcx_;
633 	bool useRdx_;
634 	int saveNum_;
635 	int P_;
636 	bool makeEpilog_;
637 	Xbyak::Reg64 pTbl_[4];
638 	Xbyak::Reg64 tTbl_[maxRegNum];
639 	Pack p_;
640 	Pack t_;
641 	StackFrame(const StackFrame&);
642 	void operator=(const StackFrame&);
643 public:
644 	const Pack& p;
645 	const Pack& t;
646 	/*
647 		make stack frame
648 		@param sf [in] this
649 		@param pNum [in] num of function parameter(0 <= pNum <= 4)
650 		@param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
651 		@param stackSizeByte [in] local stack size
652 		@param makeEpilog [in] automatically call close() if true
653 
654 		you can use
655 		rax
656 		gp0, ..., gp(pNum - 1)
657 		gt0, ..., gt(tNum-1)
658 		rcx if tNum & UseRCX
659 		rdx if tNum & UseRDX
660 		rsp[0..stackSizeByte - 1]
661 	*/
662 	StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
code_(code)663 		: code_(code)
664 		, pNum_(pNum)
665 		, tNum_(tNum & ~(UseRCX | UseRDX))
666 		, useRcx_((tNum & UseRCX) != 0)
667 		, useRdx_((tNum & UseRDX) != 0)
668 		, saveNum_(0)
669 		, P_(0)
670 		, makeEpilog_(makeEpilog)
671 		, p(p_)
672 		, t(t_)
673 	{
674 		using namespace Xbyak;
675 		if (pNum < 0 || pNum > 4) XBYAK_THROW(ERR_BAD_PNUM)
676 		const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
677 		if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
678 		const Reg64& _rsp = code->rsp;
679 		saveNum_ = (std::max)(0, allRegNum - noSaveNum);
680 		const int *tbl = getOrderTbl() + noSaveNum;
681 		for (int i = 0; i < saveNum_; i++) {
682 			code->push(Reg64(tbl[i]));
683 		}
684 		P_ = (stackSizeByte + 7) / 8;
685 		if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
686 		P_ *= 8;
687 		if (P_ > 0) code->sub(_rsp, P_);
688 		int pos = 0;
689 		for (int i = 0; i < pNum; i++) {
690 			pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
691 		}
692 		for (int i = 0; i < tNum_; i++) {
693 			tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
694 		}
695 		if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx);
696 		if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx);
697 		p_.init(pTbl_, pNum);
698 		t_.init(tTbl_, tNum_);
699 	}
700 	/*
701 		make epilog manually
702 		@param callRet [in] call ret() if true
703 	*/
704 	void close(bool callRet = true)
705 	{
706 		using namespace Xbyak;
707 		const Reg64& _rsp = code_->rsp;
708 		const int *tbl = getOrderTbl() + noSaveNum;
709 		if (P_ > 0) code_->add(_rsp, P_);
710 		for (int i = 0; i < saveNum_; i++) {
711 			code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
712 		}
713 
714 		if (callRet) code_->ret();
715 	}
~StackFrame()716 	~StackFrame()
717 	{
718 		if (!makeEpilog_) return;
719 		close();
720 	}
721 private:
getOrderTbl()722 	const int *getOrderTbl() const
723 	{
724 		using namespace Xbyak;
725 		static const int tbl[] = {
726 #ifdef XBYAK64_WIN
727 			Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
728 #else
729 			Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
730 #endif
731 			Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
732 		};
733 		return &tbl[0];
734 	}
getRegIdx(int & pos)735 	int getRegIdx(int& pos) const
736 	{
737 		assert(pos < maxRegNum);
738 		using namespace Xbyak;
739 		const int *tbl = getOrderTbl();
740 		int r = tbl[pos++];
741 		if (useRcx_) {
742 			if (r == Operand::RCX) { return Operand::R10; }
743 			if (r == Operand::R10) { r = tbl[pos++]; }
744 		}
745 		if (useRdx_) {
746 			if (r == Operand::RDX) { return Operand::R11; }
747 			if (r == Operand::R11) { return tbl[pos++]; }
748 		}
749 		return r;
750 	}
751 };
752 #endif
753 
/*
	register the names of generated functions to a profiler
	Perf  : appends "<start addr> <size> <name>" lines to /tmp/perf-<pid>.map (needs XBYAK_USE_PERF)
	VTune : notifies the method through the jitprofiling API (needs XBYAK_USE_VTUNE)
	NOTE: with XBYAK_USE_PERF the object holds an open FILE* that the destructor closes,
	      so a copy made with the implicit copy operations would close the same file twice
*/
class Profiler {
	int mode_; // None, Perf or VTune; stays None until init() succeeds
	const char *suffix_; // appended to every function name; never null
	const void *startAddr_; // start address used by set(funcName, endAddr)
#ifdef XBYAK_USE_PERF
	FILE *fp_;
#endif
public:
	enum {
		None = 0,
		Perf = 1,
		VTune = 2
	};
	Profiler()
		: mode_(None)
		, suffix_("")
		, startAddr_(0)
#ifdef XBYAK_USE_PERF
		, fp_(0)
#endif
	{
	}
	/*
		append suffix to funcName
		the pointer is kept (the string is not copied); a null pointer means no suffix
	*/
	void setNameSuffix(const char *suffix)
	{
		// keep suffix_ non-null because set() passes it to strlen() and "%s"
		suffix_ = suffix ? suffix : "";
	}
	/*
		set the start address of the first function registered by set(funcName, endAddr)
	*/
	void setStartAddr(const void *startAddr)
	{
		startAddr_ = startAddr;
	}
	/*
		select the profiler
		@param mode [in] None, Perf or VTune
		the mode stays None if the selected profiler is not compiled in or can't be activated
	*/
	void init(int mode)
	{
		mode_ = None;
		switch (mode) {
		default:
		case None:
			return;
		case Perf:
#ifdef XBYAK_USE_PERF
			close();
			{
				const int pid = getpid();
				char name[128];
				snprintf(name, sizeof(name), "/tmp/perf-%d.map", pid);
				fp_ = fopen(name, "a+");
				if (fp_ == 0) {
					fprintf(stderr, "can't open %s\n", name);
					return;
				}
			}
			mode_ = Perf;
#endif
			return;
		case VTune:
#ifdef XBYAK_USE_VTUNE
	#ifdef __linux__
			// <dlfcn.h> is included only for __linux__, so dlopen must be used only there
			dlopen("dummy", RTLD_LAZY); // force to load dlopen to enable jit profiling
	#endif
			if (iJIT_IsProfilingActive() != iJIT_SAMPLING_ON) {
				fprintf(stderr, "VTune profiling is not active\n");
				return;
			}
			mode_ = VTune;
#endif
			return;
		}
	}
	~Profiler()
	{
		close();
	}
	/*
		close the perf map file if it is open
	*/
	void close()
	{
#ifdef XBYAK_USE_PERF
		if (fp_ == 0) return;
		fclose(fp_);
		fp_ = 0;
#endif
	}
	/*
		register [startAddr, startAddr + funcSize) under the name funcName + suffix
		does nothing while the mode is None
	*/
	void set(const char *funcName, const void *startAddr, size_t funcSize) const
	{
		if (mode_ == None) return;
#if !defined(XBYAK_USE_PERF) && !defined(XBYAK_USE_VTUNE)
		(void)funcName;
		(void)startAddr;
		(void)funcSize;
#endif
#ifdef XBYAK_USE_PERF
		if (mode_ == Perf) {
			if (fp_ == 0) return;
			// convert through size_t so that a 32-bit pointer is zero-extended;
			// a direct cast to a signed 64-bit type may sign-extend the address
			fprintf(fp_, "%llx %zx %s%s", (unsigned long long)(size_t)startAddr, funcSize, funcName, suffix_);
			/*
				perf does not recognize the function name which is less than 3,
				so append '_' at the end of the name if necessary
			*/
			size_t n = strlen(funcName) + strlen(suffix_);
			for (size_t i = n; i < 3; i++) {
				fprintf(fp_, "_");
			}
			fprintf(fp_, "\n");
			fflush(fp_);
		}
#endif
#ifdef XBYAK_USE_VTUNE
		if (mode_ != VTune) return;
		char className[] = "";
		char fileName[] = "";
		iJIT_Method_Load jmethod = {};
		jmethod.method_id = iJIT_GetNewMethodID();
		jmethod.class_file_name = className;
		jmethod.source_file_name = fileName;
		jmethod.method_load_address = const_cast<void*>(startAddr);
		jmethod.method_size = funcSize;
		jmethod.line_number_size = 0;
		// the name is truncated if funcName + suffix does not fit in buf
		char buf[128];
		snprintf(buf, sizeof(buf), "%s%s", funcName, suffix_);
		jmethod.method_name = buf;
		iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, (void*)&jmethod);
#endif
	}
	/*
		for continuous set
		funcSize = endAddr - <previous set endAddr>
		(the first call uses the address given to setStartAddr())
	*/
	void set(const char *funcName, const void *endAddr)
	{
		set(funcName, startAddr_, (size_t)endAddr - (size_t)startAddr_);
		startAddr_ = endAddr;
	}
};
883 
884 } } // end of util
885 #endif
886