1 #ifndef XBYAK_XBYAK_UTIL_H_ 2 #define XBYAK_XBYAK_UTIL_H_ 3 #include <string.h> 4 5 /** 6 utility class and functions for Xbyak 7 Xbyak::util::Clock ; rdtsc timer 8 Xbyak::util::Cpu ; detect CPU 9 @note this header is UNDER CONSTRUCTION! 10 */ 11 #include "xbyak.h" 12 13 #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) 14 #define XBYAK_INTEL_CPU_SPECIFIC 15 #endif 16 17 #ifdef XBYAK_INTEL_CPU_SPECIFIC 18 #ifdef _MSC_VER 19 #if (_MSC_VER < 1400) && defined(XBYAK32) __cpuid(int[4],int)20 static inline __declspec(naked) void __cpuid(int[4], int) 21 { 22 __asm { 23 push ebx 24 push esi 25 mov eax, dword ptr [esp + 4 * 2 + 8] // eaxIn 26 cpuid 27 mov esi, dword ptr [esp + 4 * 2 + 4] // data 28 mov dword ptr [esi], eax 29 mov dword ptr [esi + 4], ebx 30 mov dword ptr [esi + 8], ecx 31 mov dword ptr [esi + 12], edx 32 pop esi 33 pop ebx 34 ret 35 } 36 } 37 #else 38 #include <intrin.h> // for __cpuid 39 #endif 40 #else 41 #ifndef __GNUC_PREREQ 42 #define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor))) 43 #endif 44 #if __GNUC_PREREQ(4, 3) && !defined(__APPLE__) 45 #include <cpuid.h> 46 #else 47 #if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm' 48 #define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn)) 49 #define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn)) 50 #else 51 #define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn)) 52 #define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn)) 53 #endif 54 #endif 55 #endif 56 #endif 57 58 #ifdef XBYAK_USE_VTUNE 59 // -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl 60 #include <jitprofiling.h> 61 #ifdef _MSC_VER 62 #pragma comment(lib, "libittnotify.lib") 63 #endif 64 #ifdef __linux__ 65 #include <dlfcn.h> 66 #endif 67 #endif 68 #ifdef __linux__ 69 #define XBYAK_USE_PERF 70 #endif 71 72 namespace Xbyak { namespace util { 73 74 typedef enum { 75 SmtLevel = 1, 76 CoreLevel = 2 77 } IntelCpuTopologyLevel; 78 79 /** 80 CPU detection class 81 */ 82 class Cpu { 83 uint64 type_; 84 //system topology 85 bool x2APIC_supported_; 86 static const size_t maxTopologyLevels = 2; 87 unsigned int numCores_[maxTopologyLevels]; 88 89 static const unsigned int maxNumberCacheLevels = 10; 90 unsigned int dataCacheSize_[maxNumberCacheLevels]; 91 unsigned int coresSharignDataCache_[maxNumberCacheLevels]; 92 unsigned int dataCacheLevels_; 93 get32bitAsBE(const char * x)94 unsigned int get32bitAsBE(const char *x) const 95 { 96 return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); 97 } mask(int n)98 unsigned int mask(int n) const 99 { 100 return (1U << n) - 1; 101 } setFamily()102 void setFamily() 103 { 104 unsigned int data[4] = {}; 105 getCpuid(1, data); 106 stepping = data[0] & mask(4); 107 model = (data[0] >> 4) & mask(4); 108 family = (data[0] >> 8) & mask(4); 109 // type = (data[0] >> 12) & mask(2); 110 extModel = (data[0] >> 16) & mask(4); 111 extFamily = (data[0] >> 20) & mask(8); 112 if (family == 0x0f) { 113 displayFamily = family + extFamily; 114 } else { 115 displayFamily = family; 116 } 117 if (family == 6 || family == 0x0f) { 118 displayModel = (extModel << 4) + model; 119 } else { 120 displayModel = model; 121 } 122 } extractBit(unsigned int val,unsigned int base,unsigned int end)123 unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end) 124 { 125 return (val >> base) & ((1u << (end - base)) - 1); 126 } setNumCores()127 void setNumCores() 128 { 129 if ((type_ & tINTEL) == 0) return; 130 131 unsigned int data[4] = {}; 132 133 /* CAUTION: These numbers are configuration as shipped by Intel. */ 134 getCpuidEx(0x0, 0, data); 135 if (data[0] >= 0xB) { 136 /* 137 if leaf 11 exists(x2APIC is supported), 138 we use it to get the number of smt cores and cores on socket 139 140 leaf 0xB can be zeroed-out by a hypervisor 141 */ 142 x2APIC_supported_ = true; 143 for (unsigned int i = 0; i < maxTopologyLevels; i++) { 144 getCpuidEx(0xB, i, data); 145 IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); 146 if (level == SmtLevel || level == CoreLevel) { 147 numCores_[level - 1] = extractBit(data[1], 0, 15); 148 } 149 } 150 /* 151 Fallback values in case a hypervisor has 0xB leaf zeroed-out. 152 */ 153 numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]); 154 numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); 155 } else { 156 /* 157 Failed to deremine num of cores without x2APIC support. 158 TODO: USE initial APIC ID to determine ncores. 159 */ 160 numCores_[SmtLevel - 1] = 0; 161 numCores_[CoreLevel - 1] = 0; 162 } 163 164 } setCacheHierarchy()165 void setCacheHierarchy() 166 { 167 if ((type_ & tINTEL) == 0) return; 168 const unsigned int NO_CACHE = 0; 169 const unsigned int DATA_CACHE = 1; 170 // const unsigned int INSTRUCTION_CACHE = 2; 171 const unsigned int UNIFIED_CACHE = 3; 172 unsigned int smt_width = 0; 173 unsigned int logical_cores = 0; 174 unsigned int data[4] = {}; 175 176 if (x2APIC_supported_) { 177 smt_width = numCores_[0]; 178 logical_cores = numCores_[1]; 179 } 180 181 /* 182 Assumptions: 183 the first level of data cache is not shared (which is the 184 case for every existing architecture) and use this to 185 determine the SMT width for arch not supporting leaf 11. 186 when leaf 4 reports a number of core less than numCores_ 187 on socket reported by leaf 11, then it is a correct number 188 of cores not an upperbound. 189 */ 190 for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) { 191 getCpuidEx(0x4, i, data); 192 unsigned int cacheType = extractBit(data[0], 0, 4); 193 if (cacheType == NO_CACHE) break; 194 if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { 195 unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1; 196 if (logical_cores != 0) { // true only if leaf 0xB is supported and valid 197 actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); 198 } 199 assert(actual_logical_cores != 0); 200 dataCacheSize_[dataCacheLevels_] = 201 (extractBit(data[1], 22, 31) + 1) 202 * (extractBit(data[1], 12, 21) + 1) 203 * (extractBit(data[1], 0, 11) + 1) 204 * (data[2] + 1); 205 if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; 206 assert(smt_width != 0); 207 coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); 208 dataCacheLevels_++; 209 } 210 } 211 } 212 213 public: 214 int model; 215 int family; 216 int stepping; 217 int extModel; 218 int extFamily; 219 int displayFamily; // family + extFamily 220 int displayModel; // model + extModel 221 getNumCores(IntelCpuTopologyLevel level)222 unsigned int getNumCores(IntelCpuTopologyLevel level) const { 223 if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0) 224 switch (level) { 225 case SmtLevel: return numCores_[level - 1]; 226 case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1]; 227 default: XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0) 228 } 229 } 230 getDataCacheLevels()231 unsigned int getDataCacheLevels() const { return dataCacheLevels_; } getCoresSharingDataCache(unsigned int i)232 unsigned int getCoresSharingDataCache(unsigned int i) const 233 { 234 if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0) 235 return coresSharignDataCache_[i]; 236 } getDataCacheSize(unsigned int i)237 unsigned int getDataCacheSize(unsigned int i) const 238 { 239 if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0) 240 return dataCacheSize_[i]; 241 } 242 243 /* 244 data[] = { eax, ebx, ecx, edx } 245 */ getCpuid(unsigned int eaxIn,unsigned int data[4])246 static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) 247 { 248 #ifdef XBYAK_INTEL_CPU_SPECIFIC 249 #ifdef _MSC_VER 250 __cpuid(reinterpret_cast<int*>(data), eaxIn); 251 #else 252 __cpuid(eaxIn, data[0], data[1], data[2], data[3]); 253 #endif 254 #else 255 (void)eaxIn; 256 (void)data; 257 #endif 258 } getCpuidEx(unsigned int eaxIn,unsigned int ecxIn,unsigned int data[4])259 static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4]) 260 { 261 #ifdef XBYAK_INTEL_CPU_SPECIFIC 262 #ifdef _MSC_VER 263 __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn); 264 #else 265 __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); 266 #endif 267 #else 268 (void)eaxIn; 269 (void)ecxIn; 270 (void)data; 271 #endif 272 } getXfeature()273 static inline uint64 getXfeature() 274 { 275 #ifdef XBYAK_INTEL_CPU_SPECIFIC 276 #ifdef _MSC_VER 277 return _xgetbv(0); 278 #else 279 unsigned int eax, edx; 280 // xgetvb is not support on gcc 4.2 281 // __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); 282 __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); 283 return ((uint64)edx << 32) | eax; 284 #endif 285 #else 286 return 0; 287 #endif 288 } 289 typedef uint64 Type; 290 291 static const Type NONE = 0; 292 static const Type tMMX = 1 << 0; 293 static const Type tMMX2 = 1 << 1; 294 static const Type tCMOV = 1 << 2; 295 static const Type tSSE = 1 << 3; 296 static const Type tSSE2 = 1 << 4; 297 static const Type tSSE3 = 1 << 5; 298 static const Type tSSSE3 = 1 << 6; 299 static const Type tSSE41 = 1 << 7; 300 static const Type tSSE42 = 1 << 8; 301 static const Type tPOPCNT = 1 << 9; 302 static const Type tAESNI = 1 << 10; 303 static const Type tSSE5 = 1 << 11; 304 static const Type tOSXSAVE = 1 << 12; 305 static const Type tPCLMULQDQ = 1 << 13; 306 static const Type tAVX = 1 << 14; 307 static const Type tFMA = 1 << 15; 308 309 static const Type t3DN = 1 << 16; 310 static const Type tE3DN = 1 << 17; 311 static const Type tSSE4a = 1 << 18; 312 static const Type tRDTSCP = 1 << 19; 313 static const Type tAVX2 = 1 << 20; 314 static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt 315 static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx 316 static const Type tLZCNT = 1 << 23; 317 318 static const Type tINTEL = 1 << 24; 319 static const Type tAMD = 1 << 25; 320 321 static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb 322 static const Type tRDRAND = 1 << 27; 323 static const Type tADX = 1 << 28; // adcx, adox 324 static const Type tRDSEED = 1 << 29; // rdseed 325 static const Type tSMAP = 1 << 30; // stac 326 static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest 327 static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort 328 static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph 329 static const Type tMOVBE = uint64(1) << 34; // mobve 330 static const Type tAVX512F = uint64(1) << 35; 331 static const Type tAVX512DQ = uint64(1) << 36; 332 static const Type tAVX512_IFMA = uint64(1) << 37; 333 static const Type tAVX512IFMA = tAVX512_IFMA; 334 static const Type tAVX512PF = uint64(1) << 38; 335 static const Type tAVX512ER = uint64(1) << 39; 336 static const Type tAVX512CD = uint64(1) << 40; 337 static const Type tAVX512BW = uint64(1) << 41; 338 static const Type tAVX512VL = uint64(1) << 42; 339 static const Type tAVX512_VBMI = uint64(1) << 43; 340 static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual 341 static const Type tAVX512_4VNNIW = uint64(1) << 44; 342 static const Type tAVX512_4FMAPS = uint64(1) << 45; 343 static const Type tPREFETCHWT1 = uint64(1) << 46; 344 static const Type tPREFETCHW = uint64(1) << 47; 345 static const Type tSHA = uint64(1) << 48; 346 static const Type tMPX = uint64(1) << 49; 347 static const Type tAVX512_VBMI2 = uint64(1) << 50; 348 static const Type tGFNI = uint64(1) << 51; 349 static const Type tVAES = uint64(1) << 52; 350 static const Type tVPCLMULQDQ = uint64(1) << 53; 351 static const Type tAVX512_VNNI = uint64(1) << 54; 352 static const Type tAVX512_BITALG = uint64(1) << 55; 353 static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56; 354 static const Type tAVX512_BF16 = uint64(1) << 57; 355 static const Type tAVX512_VP2INTERSECT = uint64(1) << 58; 356 static const Type tAMX_TILE = uint64(1) << 59; 357 static const Type tAMX_INT8 = uint64(1) << 60; 358 static const Type tAMX_BF16 = uint64(1) << 61; 359 Cpu()360 Cpu() 361 : type_(NONE) 362 , x2APIC_supported_(false) 363 , numCores_() 364 , dataCacheSize_() 365 , coresSharignDataCache_() 366 , dataCacheLevels_(0) 367 { 368 unsigned int data[4] = {}; 369 const unsigned int& EAX = data[0]; 370 const unsigned int& EBX = data[1]; 371 const unsigned int& ECX = data[2]; 372 const unsigned int& EDX = data[3]; 373 getCpuid(0, data); 374 const unsigned int maxNum = EAX; 375 static const char intel[] = "ntel"; 376 static const char amd[] = "cAMD"; 377 if (ECX == get32bitAsBE(amd)) { 378 type_ |= tAMD; 379 getCpuid(0x80000001, data); 380 if (EDX & (1U << 31)) type_ |= t3DN; 381 if (EDX & (1U << 15)) type_ |= tCMOV; 382 if (EDX & (1U << 30)) type_ |= tE3DN; 383 if (EDX & (1U << 22)) type_ |= tMMX2; 384 if (EDX & (1U << 27)) type_ |= tRDTSCP; 385 } 386 if (ECX == get32bitAsBE(intel)) { 387 type_ |= tINTEL; 388 getCpuid(0x80000001, data); 389 if (EDX & (1U << 27)) type_ |= tRDTSCP; 390 if (ECX & (1U << 5)) type_ |= tLZCNT; 391 if (ECX & (1U << 8)) type_ |= tPREFETCHW; 392 } 393 getCpuid(1, data); 394 if (ECX & (1U << 0)) type_ |= tSSE3; 395 if (ECX & (1U << 9)) type_ |= tSSSE3; 396 if (ECX & (1U << 19)) type_ |= tSSE41; 397 if (ECX & (1U << 20)) type_ |= tSSE42; 398 if (ECX & (1U << 22)) type_ |= tMOVBE; 399 if (ECX & (1U << 23)) type_ |= tPOPCNT; 400 if (ECX & (1U << 25)) type_ |= tAESNI; 401 if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; 402 if (ECX & (1U << 27)) type_ |= tOSXSAVE; 403 if (ECX & (1U << 30)) type_ |= tRDRAND; 404 if (ECX & (1U << 29)) type_ |= tF16C; 405 406 if (EDX & (1U << 15)) type_ |= tCMOV; 407 if (EDX & (1U << 23)) type_ |= tMMX; 408 if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE; 409 if (EDX & (1U << 26)) type_ |= tSSE2; 410 411 if (type_ & tOSXSAVE) { 412 // check XFEATURE_ENABLED_MASK[2:1] = '11b' 413 uint64 bv = getXfeature(); 414 if ((bv & 6) == 6) { 415 if (ECX & (1U << 28)) type_ |= tAVX; 416 if (ECX & (1U << 12)) type_ |= tFMA; 417 if (((bv >> 5) & 7) == 7) { 418 getCpuidEx(7, 0, data); 419 if (EBX & (1U << 16)) type_ |= tAVX512F; 420 if (type_ & tAVX512F) { 421 if (EBX & (1U << 17)) type_ |= tAVX512DQ; 422 if (EBX & (1U << 21)) type_ |= tAVX512_IFMA; 423 if (EBX & (1U << 26)) type_ |= tAVX512PF; 424 if (EBX & (1U << 27)) type_ |= tAVX512ER; 425 if (EBX & (1U << 28)) type_ |= tAVX512CD; 426 if (EBX & (1U << 30)) type_ |= tAVX512BW; 427 if (EBX & (1U << 31)) type_ |= tAVX512VL; 428 if (ECX & (1U << 1)) type_ |= tAVX512_VBMI; 429 if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2; 430 if (ECX & (1U << 8)) type_ |= tGFNI; 431 if (ECX & (1U << 9)) type_ |= tVAES; 432 if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; 433 if (ECX & (1U << 11)) type_ |= tAVX512_VNNI; 434 if (ECX & (1U << 12)) type_ |= tAVX512_BITALG; 435 if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; 436 if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW; 437 if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS; 438 if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT; 439 } 440 // EAX=07H, ECX=1 441 getCpuidEx(7, 1, data); 442 if (type_ & tAVX512F) { 443 if (EAX & (1U << 5)) type_ |= tAVX512_BF16; 444 } 445 } 446 } 447 } 448 if (maxNum >= 7) { 449 getCpuidEx(7, 0, data); 450 if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2; 451 if (EBX & (1U << 3)) type_ |= tBMI1; 452 if (EBX & (1U << 8)) type_ |= tBMI2; 453 if (EBX & (1U << 9)) type_ |= tENHANCED_REP; 454 if (EBX & (1U << 18)) type_ |= tRDSEED; 455 if (EBX & (1U << 19)) type_ |= tADX; 456 if (EBX & (1U << 20)) type_ |= tSMAP; 457 if (EBX & (1U << 4)) type_ |= tHLE; 458 if (EBX & (1U << 11)) type_ |= tRTM; 459 if (EBX & (1U << 14)) type_ |= tMPX; 460 if (EBX & (1U << 29)) type_ |= tSHA; 461 if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; 462 if (EDX & (1U << 24)) type_ |= tAMX_TILE; 463 if (EDX & (1U << 25)) type_ |= tAMX_INT8; 464 if (EDX & (1U << 22)) type_ |= tAMX_BF16; 465 } 466 setFamily(); 467 setNumCores(); 468 setCacheHierarchy(); 469 } putFamily()470 void putFamily() const 471 { 472 printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n", 473 family, model, stepping, extFamily, extModel); 474 printf("display:family=%X, model=%X\n", displayFamily, displayModel); 475 } has(Type type)476 bool has(Type type) const 477 { 478 return (type & type_) != 0; 479 } 480 }; 481 482 class Clock { 483 public: getRdtsc()484 static inline uint64 getRdtsc() 485 { 486 #ifdef XBYAK_INTEL_CPU_SPECIFIC 487 #ifdef _MSC_VER 488 return __rdtsc(); 489 #else 490 unsigned int eax, edx; 491 __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx)); 492 return ((uint64)edx << 32) | eax; 493 #endif 494 #else 495 // TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu 496 return 0; 497 #endif 498 } Clock()499 Clock() 500 : clock_(0) 501 , count_(0) 502 { 503 } begin()504 void begin() 505 { 506 clock_ -= getRdtsc(); 507 } end()508 void end() 509 { 510 clock_ += getRdtsc(); 511 count_++; 512 } getCount()513 int getCount() const { return count_; } getClock()514 uint64 getClock() const { return clock_; } clear()515 void clear() { count_ = 0; clock_ = 0; } 516 private: 517 uint64 clock_; 518 int count_; 519 }; 520 521 #ifdef XBYAK64 522 const int UseRCX = 1 << 6; 523 const int UseRDX = 1 << 7; 524 525 class Pack { 526 static const size_t maxTblNum = 15; 527 const Xbyak::Reg64 *tbl_[maxTblNum]; 528 size_t n_; 529 public: Pack()530 Pack() : tbl_(), n_(0) {} Pack(const Xbyak::Reg64 * tbl,size_t n)531 Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); } Pack(const Pack & rhs)532 Pack(const Pack& rhs) 533 : n_(rhs.n_) 534 { 535 for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i]; 536 } 537 Pack& operator=(const Pack& rhs) 538 { 539 n_ = rhs.n_; 540 for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i]; 541 return *this; 542 } Pack(const Xbyak::Reg64 & t0)543 Pack(const Xbyak::Reg64& t0) 544 { n_ = 1; tbl_[0] = &t0; } Pack(const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)545 Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 546 { n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; } Pack(const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)547 Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 548 { n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; } Pack(const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)549 Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 550 { n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; } Pack(const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)551 Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 552 { n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; } Pack(const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)553 Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 554 { n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; } Pack(const Xbyak::Reg64 & t6,const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)555 Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 556 { n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; } Pack(const Xbyak::Reg64 & t7,const Xbyak::Reg64 & t6,const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)557 Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 558 { n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; } Pack(const Xbyak::Reg64 & t8,const Xbyak::Reg64 & t7,const Xbyak::Reg64 & t6,const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)559 Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 560 { n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; } Pack(const Xbyak::Reg64 & t9,const Xbyak::Reg64 & t8,const Xbyak::Reg64 & t7,const Xbyak::Reg64 & t6,const Xbyak::Reg64 & t5,const Xbyak::Reg64 & t4,const Xbyak::Reg64 & t3,const Xbyak::Reg64 & t2,const Xbyak::Reg64 & t1,const Xbyak::Reg64 & t0)561 Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) 562 { n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; } append(const Xbyak::Reg64 & t)563 Pack& append(const Xbyak::Reg64& t) 564 { 565 if (n_ == maxTblNum) { 566 fprintf(stderr, "ERR Pack::can't append\n"); 567 XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this) 568 } 569 tbl_[n_++] = &t; 570 return *this; 571 } init(const Xbyak::Reg64 * tbl,size_t n)572 void init(const Xbyak::Reg64 *tbl, size_t n) 573 { 574 if (n > maxTblNum) { 575 fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n); 576 XBYAK_THROW(ERR_BAD_PARAMETER) 577 } 578 n_ = n; 579 for (size_t i = 0; i < n; i++) { 580 tbl_[i] = &tbl[i]; 581 } 582 } 583 const Xbyak::Reg64& operator[](size_t n) const 584 { 585 if (n >= n_) { 586 fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_); 587 XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax) 588 } 589 return *tbl_[n]; 590 } size()591 size_t size() const { return n_; } 592 /* 593 get tbl[pos, pos + num) 594 */ 595 Pack sub(size_t pos, size_t num = size_t(-1)) const 596 { 597 if (num == size_t(-1)) num = n_ - pos; 598 if (pos + num > n_) { 599 fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num); 600 XBYAK_THROW_RET(ERR_BAD_PARAMETER, Pack()) 601 } 602 Pack pack; 603 pack.n_ = num; 604 for (size_t i = 0; i < num; i++) { 605 pack.tbl_[i] = tbl_[pos + i]; 606 } 607 return pack; 608 } put()609 void put() const 610 { 611 for (size_t i = 0; i < n_; i++) { 612 printf("%s ", tbl_[i]->toString()); 613 } 614 printf("\n"); 615 } 616 }; 617 618 class StackFrame { 619 #ifdef XBYAK64_WIN 620 static const int noSaveNum = 6; 621 static const int rcxPos = 0; 622 static const int rdxPos = 1; 623 #else 624 static const int noSaveNum = 8; 625 static const int rcxPos = 3; 626 static const int rdxPos = 2; 627 #endif 628 static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax 629 Xbyak::CodeGenerator *code_; 630 int pNum_; 631 int tNum_; 632 bool useRcx_; 633 bool useRdx_; 634 int saveNum_; 635 int P_; 636 bool makeEpilog_; 637 Xbyak::Reg64 pTbl_[4]; 638 Xbyak::Reg64 tTbl_[maxRegNum]; 639 Pack p_; 640 Pack t_; 641 StackFrame(const StackFrame&); 642 void operator=(const StackFrame&); 643 public: 644 const Pack& p; 645 const Pack& t; 646 /* 647 make stack frame 648 @param sf [in] this 649 @param pNum [in] num of function parameter(0 <= pNum <= 4) 650 @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14 651 @param stackSizeByte [in] local stack size 652 @param makeEpilog [in] automatically call close() if true 653 654 you can use 655 rax 656 gp0, ..., gp(pNum - 1) 657 gt0, ..., gt(tNum-1) 658 rcx if tNum & UseRCX 659 rdx if tNum & UseRDX 660 rsp[0..stackSizeByte - 1] 661 */ 662 StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true) code_(code)663 : code_(code) 664 , pNum_(pNum) 665 , tNum_(tNum & ~(UseRCX | UseRDX)) 666 , useRcx_((tNum & UseRCX) != 0) 667 , useRdx_((tNum & UseRDX) != 0) 668 , saveNum_(0) 669 , P_(0) 670 , makeEpilog_(makeEpilog) 671 , p(p_) 672 , t(t_) 673 { 674 using namespace Xbyak; 675 if (pNum < 0 || pNum > 4) XBYAK_THROW(ERR_BAD_PNUM) 676 const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); 677 if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM) 678 const Reg64& _rsp = code->rsp; 679 saveNum_ = (std::max)(0, allRegNum - noSaveNum); 680 const int *tbl = getOrderTbl() + noSaveNum; 681 for (int i = 0; i < saveNum_; i++) { 682 code->push(Reg64(tbl[i])); 683 } 684 P_ = (stackSizeByte + 7) / 8; 685 if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment 686 P_ *= 8; 687 if (P_ > 0) code->sub(_rsp, P_); 688 int pos = 0; 689 for (int i = 0; i < pNum; i++) { 690 pTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); 691 } 692 for (int i = 0; i < tNum_; i++) { 693 tTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); 694 } 695 if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx); 696 if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx); 697 p_.init(pTbl_, pNum); 698 t_.init(tTbl_, tNum_); 699 } 700 /* 701 make epilog manually 702 @param callRet [in] call ret() if true 703 */ 704 void close(bool callRet = true) 705 { 706 using namespace Xbyak; 707 const Reg64& _rsp = code_->rsp; 708 const int *tbl = getOrderTbl() + noSaveNum; 709 if (P_ > 0) code_->add(_rsp, P_); 710 for (int i = 0; i < saveNum_; i++) { 711 code_->pop(Reg64(tbl[saveNum_ - 1 - i])); 712 } 713 714 if (callRet) code_->ret(); 715 } ~StackFrame()716 ~StackFrame() 717 { 718 if (!makeEpilog_) return; 719 close(); 720 } 721 private: getOrderTbl()722 const int *getOrderTbl() const 723 { 724 using namespace Xbyak; 725 static const int tbl[] = { 726 #ifdef XBYAK64_WIN 727 Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI, 728 #else 729 Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, 730 #endif 731 Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15 732 }; 733 return &tbl[0]; 734 } getRegIdx(int & pos)735 int getRegIdx(int& pos) const 736 { 737 assert(pos < maxRegNum); 738 using namespace Xbyak; 739 const int *tbl = getOrderTbl(); 740 int r = tbl[pos++]; 741 if (useRcx_) { 742 if (r == Operand::RCX) { return Operand::R10; } 743 if (r == Operand::R10) { r = tbl[pos++]; } 744 } 745 if (useRdx_) { 746 if (r == Operand::RDX) { return Operand::R11; } 747 if (r == Operand::R11) { return tbl[pos++]; } 748 } 749 return r; 750 } 751 }; 752 #endif 753 754 class Profiler { 755 int mode_; 756 const char *suffix_; 757 const void *startAddr_; 758 #ifdef XBYAK_USE_PERF 759 FILE *fp_; 760 #endif 761 public: 762 enum { 763 None = 0, 764 Perf = 1, 765 VTune = 2 766 }; Profiler()767 Profiler() 768 : mode_(None) 769 , suffix_("") 770 , startAddr_(0) 771 #ifdef XBYAK_USE_PERF 772 , fp_(0) 773 #endif 774 { 775 } 776 // append suffix to funcName setNameSuffix(const char * suffix)777 void setNameSuffix(const char *suffix) 778 { 779 suffix_ = suffix; 780 } setStartAddr(const void * startAddr)781 void setStartAddr(const void *startAddr) 782 { 783 startAddr_ = startAddr; 784 } init(int mode)785 void init(int mode) 786 { 787 mode_ = None; 788 switch (mode) { 789 default: 790 case None: 791 return; 792 case Perf: 793 #ifdef XBYAK_USE_PERF 794 close(); 795 { 796 const int pid = getpid(); 797 char name[128]; 798 snprintf(name, sizeof(name), "/tmp/perf-%d.map", pid); 799 fp_ = fopen(name, "a+"); 800 if (fp_ == 0) { 801 fprintf(stderr, "can't open %s\n", name); 802 return; 803 } 804 } 805 mode_ = Perf; 806 #endif 807 return; 808 case VTune: 809 #ifdef XBYAK_USE_VTUNE 810 dlopen("dummy", RTLD_LAZY); // force to load dlopen to enable jit profiling 811 if (iJIT_IsProfilingActive() != iJIT_SAMPLING_ON) { 812 fprintf(stderr, "VTune profiling is not active\n"); 813 return; 814 } 815 mode_ = VTune; 816 #endif 817 return; 818 } 819 } ~Profiler()820 ~Profiler() 821 { 822 close(); 823 } close()824 void close() 825 { 826 #ifdef XBYAK_USE_PERF 827 if (fp_ == 0) return; 828 fclose(fp_); 829 fp_ = 0; 830 #endif 831 } set(const char * funcName,const void * startAddr,size_t funcSize)832 void set(const char *funcName, const void *startAddr, size_t funcSize) const 833 { 834 if (mode_ == None) return; 835 #if !defined(XBYAK_USE_PERF) && !defined(XBYAK_USE_VTUNE) 836 (void)funcName; 837 (void)startAddr; 838 (void)funcSize; 839 #endif 840 #ifdef XBYAK_USE_PERF 841 if (mode_ == Perf) { 842 if (fp_ == 0) return; 843 fprintf(fp_, "%llx %zx %s%s", (long long)startAddr, funcSize, funcName, suffix_); 844 /* 845 perf does not recognize the function name which is less than 3, 846 so append '_' at the end of the name if necessary 847 */ 848 size_t n = strlen(funcName) + strlen(suffix_); 849 for (size_t i = n; i < 3; i++) { 850 fprintf(fp_, "_"); 851 } 852 fprintf(fp_, "\n"); 853 fflush(fp_); 854 } 855 #endif 856 #ifdef XBYAK_USE_VTUNE 857 if (mode_ != VTune) return; 858 char className[] = ""; 859 char fileName[] = ""; 860 iJIT_Method_Load jmethod = {}; 861 jmethod.method_id = iJIT_GetNewMethodID(); 862 jmethod.class_file_name = className; 863 jmethod.source_file_name = fileName; 864 jmethod.method_load_address = const_cast<void*>(startAddr); 865 jmethod.method_size = funcSize; 866 jmethod.line_number_size = 0; 867 char buf[128]; 868 snprintf(buf, sizeof(buf), "%s%s", funcName, suffix_); 869 jmethod.method_name = buf; 870 iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, (void*)&jmethod); 871 #endif 872 } 873 /* 874 for continuous set 875 funcSize = endAddr - <previous set endAddr> 876 */ set(const char * funcName,const void * endAddr)877 void set(const char *funcName, const void *endAddr) 878 { 879 set(funcName, startAddr_, (size_t)endAddr - (size_t)startAddr_); 880 startAddr_ = endAddr; 881 } 882 }; 883 884 } } // end of util 885 #endif 886