/*******************************************************************************
* Copyright 2016-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/*******************************************************************************
* Copyright (c) 2007 MITSUNARI Shigeo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of the copyright owner nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#ifndef XBYAK_XBYAK_UTIL_H_
#define XBYAK_XBYAK_UTIL_H_

#ifdef XBYAK_ONLY_CLASS_CPU
// Standalone mode: only Xbyak::util::Cpu is provided and xbyak.h is not required.
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <assert.h>
#ifndef XBYAK_THROW
	// Without xbyak.h there is no error machinery: errors degrade to "return the fallback".
	#define XBYAK_THROW(x) ;
	#define XBYAK_THROW_RET(x, y) return y;
#endif
#else
#include <string.h>

/**
	utility class and functions for Xbyak
	Xbyak::util::Clock ; rdtsc timer
	Xbyak::util::Cpu ; detect CPU
	@note this header is UNDER CONSTRUCTION!
*/
#include "xbyak.h"
#endif // XBYAK_ONLY_CLASS_CPU

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
	#define XBYAK_INTEL_CPU_SPECIFIC
#endif

#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
	#if (_MSC_VER < 1400) && defined(XBYAK32)
	// Old 32-bit MSVC has no <intrin.h>; provide __cpuid(data, eaxIn) by hand.
	static inline __declspec(naked) void __cpuid(int[4], int)
	{
		__asm {
			push	ebx
			push	esi
			mov		eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
			cpuid
			mov		esi, dword ptr [esp + 4 * 2 + 4] // data
			mov		dword ptr [esi], eax
			mov		dword ptr [esi + 4], ebx
			mov		dword ptr [esi + 8], ecx
			mov		dword ptr [esi + 12], edx
			pop		esi
			pop		ebx
			ret
		}
	}
	#else
		#include <intrin.h> // for __cpuid
	#endif
#else
	#ifndef __GNUC_PREREQ
		#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
	#endif
	#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
		#include <cpuid.h>
	#else
		#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
			// ebx is reserved (PIC register), so it is saved around cpuid and its
			// result is handed back through esi ("=S"(b)).
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#else
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#endif
	#endif
#endif
#endif

#ifdef XBYAK_USE_VTUNE
	// -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
	#include <jitprofiling.h>
	#ifdef _MSC_VER
		#pragma comment(lib, "libittnotify.lib")
	#endif
	#ifdef __linux__
		#include <dlfcn.h>
	#endif
#endif
#ifdef __linux__
	#define XBYAK_USE_PERF
#endif

namespace Xbyak { namespace util {

/**
	topology levels reported by CPUID leaf 0xB (value of ECX[15:8])
*/
typedef enum {
	SmtLevel = 1,
	CoreLevel = 2
} IntelCpuTopologyLevel;

/**
	CPU detection class

	The constructor runs CPUID (and XGETBV when available) once and caches
	the feature bits, family/model information, the core topology and the
	data cache hierarchy. The topology and cache queries are filled in for
	Intel CPUs only.
*/
class Cpu {
	uint64_t type_;
	//system topology
	bool x2APIC_supported_;
	static const size_t maxTopologyLevels = 2;
	unsigned int numCores_[maxTopologyLevels];

	static const unsigned int maxNumberCacheLevels = 10;
	unsigned int dataCacheSize_[maxNumberCacheLevels];
	unsigned int coresSharingDataCache_[maxNumberCacheLevels];
	unsigned int dataCacheLevels_;

	/*
		pack 4 characters into a 32-bit value, x[0] in the lowest byte
		(despite the name this is the little-endian register layout of the
		CPUID vendor string).
		Bytes are read as unsigned so that a signed char cannot sign-extend.
	*/
	unsigned int get32bitAsBE(const char *x) const
	{
		const unsigned char *p = reinterpret_cast<const unsigned char*>(x);
		return p[0] | (p[1] << 8) | (p[2] << 16) | ((unsigned int)p[3] << 24);
	}
	/*
		mask with the lowest n bits set (0 <= n < 32)
	*/
	unsigned int mask(int n) const
	{
		return (1U << n) - 1;
	}
	/*
		decode CPUID leaf 1 EAX into stepping/model/family and the
		display family/model
	*/
	void setFamily()
	{
		unsigned int data[4] = {};
		getCpuid(1, data);
		stepping = data[0] & mask(4);
		model = (data[0] >> 4) & mask(4);
		family = (data[0] >> 8) & mask(4);
		// type = (data[0] >> 12) & mask(2);
		extModel = (data[0] >> 16) & mask(4);
		extFamily = (data[0] >> 20) & mask(8);
		if (family == 0x0f) {
			displayFamily = family + extFamily;
		} else {
			displayFamily = family;
		}
		if (family == 6 || family == 0x0f) {
			displayModel = (extModel << 4) + model;
		} else {
			displayModel = model;
		}
	}
	/*
		extract the bit field val[end:base]; both bounds are inclusive so
		that callers can pass the bit ranges exactly as written in the
		CPUID documentation (e.g. bits 25:14 -> extractBit(val, 14, 25)).
		@note requires base <= end <= 31
	*/
	static unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
	{
		const unsigned int width = end + 1 - base;
		// shifting a 32-bit value by 32 is undefined, so handle the full-width field separately
		const unsigned int fieldMask = (width >= 32) ? 0xFFFFFFFFu : ((1u << width) - 1);
		return (val >> base) & fieldMask;
	}
	/*
		read the number of logical processors per core and per package
		from CPUID leaf 0xB (Intel only)
	*/
	void setNumCores()
	{
		if ((type_ & tINTEL) == 0) return;

		unsigned int data[4] = {};

		/* CAUTION: These numbers are configuration as shipped by Intel. */
		getCpuidEx(0x0, 0, data);
		if (data[0] >= 0xB) {
			/*
				if leaf 11 exists(x2APIC is supported),
				we use it to get the number of smt cores and cores on socket

				leaf 0xB can be zeroed-out by a hypervisor
			*/
			x2APIC_supported_ = true;
			for (unsigned int i = 0; i < maxTopologyLevels; i++) {
				getCpuidEx(0xB, i, data);
				// ECX[15:8] = level type, EBX[15:0] = number of logical processors at this level
				IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
				if (level == SmtLevel || level == CoreLevel) {
					numCores_[level - 1] = extractBit(data[1], 0, 15);
				}
			}
			/*
				Fallback values in case a hypervisor has 0xB leaf zeroed-out.
			*/
			numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]);
			numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
		} else {
			/*
				Failed to determine num of cores without x2APIC support.
				TODO: USE initial APIC ID to determine ncores.
			*/
			numCores_[SmtLevel - 1] = 0;
			numCores_[CoreLevel - 1] = 0;
		}

	}
	/*
		enumerate the data/unified caches with CPUID leaf 4 (Intel only)
		and record the size and the number of cores sharing each of them
	*/
	void setCacheHierarchy()
	{
		if ((type_ & tINTEL) == 0) return;
		const unsigned int NO_CACHE = 0;
		const unsigned int DATA_CACHE = 1;
//		const unsigned int INSTRUCTION_CACHE = 2;
		const unsigned int UNIFIED_CACHE = 3;
		unsigned int smt_width = 0;
		unsigned int logical_cores = 0;
		unsigned int data[4] = {};

		if (x2APIC_supported_) {
			smt_width = numCores_[0];
			logical_cores = numCores_[1];
		}

		/*
			Assumptions:
			the first level of data cache is not shared (which is the
			case for every existing architecture) and use this to
			determine the SMT width for arch not supporting leaf 11.
			when leaf 4 reports a number of core less than numCores_
			on socket reported by leaf 11, then it is a correct number
			of cores not an upperbound.
		*/
		for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
			getCpuidEx(0x4, i, data);
			unsigned int cacheType = extractBit(data[0], 0, 4); // EAX[4:0]
			if (cacheType == NO_CACHE) break;
			if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
				// EAX[25:14] = max number of logical processors sharing this cache - 1
				unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
				if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
					actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
				}
				assert(actual_logical_cores != 0);
				// size = ways(EBX[31:22]+1) * partitions(EBX[21:12]+1) * line size(EBX[11:0]+1) * sets(ECX+1)
				dataCacheSize_[dataCacheLevels_] =
					(extractBit(data[1], 22, 31) + 1)
					* (extractBit(data[1], 12, 21) + 1)
					* (extractBit(data[1], 0, 11) + 1)
					* (data[2] + 1);
				if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
				assert(smt_width != 0);
				coresSharingDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
				dataCacheLevels_++;
			}
		}
	}

public:
	int model;
	int family;
	int stepping;
	int extModel;
	int extFamily;
	int displayFamily; // family + extFamily
	int displayModel; // model + extModel

	/*
		@param level [in] SmtLevel : logical processors per core
		                  CoreLevel : cores per package
		@note raises ERR_X2APIC_IS_NOT_SUPPORTED (or returns 0 when errors
		are compiled out) if the topology could not be determined
	*/
	unsigned int getNumCores(IntelCpuTopologyLevel level) const {
		if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
		switch (level) {
		case SmtLevel: return numCores_[level - 1];
		case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
		default: XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
		}
	}

	// number of data/unified cache levels that were detected
	unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
	// number of cores sharing the i-th detected cache (0 <= i < getDataCacheLevels())
	unsigned int getCoresSharingDataCache(unsigned int i) const
	{
		if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
		return coresSharingDataCache_[i];
	}
	// size in bytes of the i-th detected cache (0 <= i < getDataCacheLevels())
	unsigned int getDataCacheSize(unsigned int i) const
	{
		if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
		return dataCacheSize_[i];
	}

	/*
		data[] = { eax, ebx, ecx, edx }
		(data is left untouched on non-x86 targets)
	*/
	static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
	{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
	#ifdef _MSC_VER
		__cpuid(reinterpret_cast<int*>(data), eaxIn);
	#else
		__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
	#endif
#else
		(void)eaxIn;
		(void)data;
#endif
	}
	/*
		same as getCpuid() but also selects the sub-leaf with ecxIn
	*/
	static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
	{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
	#ifdef _MSC_VER
		__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
	#else
		__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
	#endif
#else
		(void)eaxIn;
		(void)ecxIn;
		(void)data;
#endif
	}
	/*
		read XCR0 (XFEATURE_ENABLED_MASK); returns 0 on non-x86 targets
		@note the caller must check OSXSAVE first, xgetbv faults otherwise
	*/
	static inline uint64_t getXfeature()
	{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
	#ifdef _MSC_VER
		return _xgetbv(0);
	#else
		unsigned int eax, edx;
		// xgetbv is not supported on gcc 4.2
//		__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
		__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
		return ((uint64_t)edx << 32) | eax;
	#endif
#else
		return 0;
#endif
	}
	typedef uint64_t Type;

	static const Type NONE = 0;
	static const Type tMMX = 1 << 0;
	static const Type tMMX2 = 1 << 1;
	static const Type tCMOV = 1 << 2;
	static const Type tSSE = 1 << 3;
	static const Type tSSE2 = 1 << 4;
	static const Type tSSE3 = 1 << 5;
	static const Type tSSSE3 = 1 << 6;
	static const Type tSSE41 = 1 << 7;
	static const Type tSSE42 = 1 << 8;
	static const Type tPOPCNT = 1 << 9;
	static const Type tAESNI = 1 << 10;
	static const Type tSSE5 = 1 << 11;
	static const Type tOSXSAVE = 1 << 12;
	static const Type tPCLMULQDQ = 1 << 13;
	static const Type tAVX = 1 << 14;
	static const Type tFMA = 1 << 15;

	static const Type t3DN = 1 << 16;
	static const Type tE3DN = 1 << 17;
	static const Type tSSE4a = 1 << 18;
	static const Type tRDTSCP = 1 << 19;
	static const Type tAVX2 = 1 << 20;
	static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
	static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
	static const Type tLZCNT = 1 << 23;

	static const Type tINTEL = 1 << 24;
	static const Type tAMD = 1 << 25;

	static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
	static const Type tRDRAND = 1 << 27;
	static const Type tADX = 1 << 28; // adcx, adox
	static const Type tRDSEED = 1 << 29; // rdseed
	static const Type tSMAP = 1 << 30; // stac
	static const Type tHLE = uint64_t(1) << 31; // xacquire, xrelease, xtest
	static const Type tRTM = uint64_t(1) << 32; // xbegin, xend, xabort
	static const Type tF16C = uint64_t(1) << 33; // vcvtph2ps, vcvtps2ph
	static const Type tMOVBE = uint64_t(1) << 34; // movbe
	static const Type tAVX512F = uint64_t(1) << 35;
	static const Type tAVX512DQ = uint64_t(1) << 36;
	static const Type tAVX512_IFMA = uint64_t(1) << 37;
	static const Type tAVX512IFMA = tAVX512_IFMA;
	static const Type tAVX512PF = uint64_t(1) << 38;
	static const Type tAVX512ER = uint64_t(1) << 39;
	static const Type tAVX512CD = uint64_t(1) << 40;
	static const Type tAVX512BW = uint64_t(1) << 41;
	static const Type tAVX512VL = uint64_t(1) << 42;
	static const Type tAVX512_VBMI = uint64_t(1) << 43;
	static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
	static const Type tAVX512_4VNNIW = uint64_t(1) << 44;
	static const Type tAVX512_4FMAPS = uint64_t(1) << 45;
	static const Type tPREFETCHWT1 = uint64_t(1) << 46;
	static const Type tPREFETCHW = uint64_t(1) << 47;
	static const Type tSHA = uint64_t(1) << 48;
	static const Type tMPX = uint64_t(1) << 49;
	static const Type tAVX512_VBMI2 = uint64_t(1) << 50;
	static const Type tGFNI = uint64_t(1) << 51;
	static const Type tVAES = uint64_t(1) << 52;
	static const Type tVPCLMULQDQ = uint64_t(1) << 53;
	static const Type tAVX512_VNNI = uint64_t(1) << 54;
	static const Type tAVX512_BITALG = uint64_t(1) << 55;
	static const Type tAVX512_VPOPCNTDQ = uint64_t(1) << 56;
	static const Type tAVX512_BF16 = uint64_t(1) << 57;
	static const Type tAVX512_VP2INTERSECT = uint64_t(1) << 58;
	static const Type tAMX_TILE = uint64_t(1) << 59;
	static const Type tAMX_INT8 = uint64_t(1) << 60;
	static const Type tAMX_BF16 = uint64_t(1) << 61;
	static const Type tAVX_VNNI = uint64_t(1) << 62;

	/*
		detect vendor, feature flags, family/model, topology and caches
	*/
	Cpu()
		: type_(NONE)
		, x2APIC_supported_(false)
		, numCores_()
		, dataCacheSize_()
		, coresSharingDataCache_()
		, dataCacheLevels_(0)
	{
		unsigned int data[4] = {};
		const unsigned int& EAX = data[0];
		const unsigned int& EBX = data[1];
		const unsigned int& ECX = data[2];
		const unsigned int& EDX = data[3];
		getCpuid(0, data);
		const unsigned int maxNum = EAX;
		// keep the vendor word: data[] is overwritten by the AMD probe below
		const unsigned int vendorEcx = ECX;
		static const char intel[] = "ntel";
		static const char amd[] = "cAMD";
		if (vendorEcx == get32bitAsBE(amd)) {
			type_ |= tAMD;
			getCpuid(0x80000001, data);
			if (EDX & (1U << 31)) {
				type_ |= t3DN;
				// 3DNow! implies support for PREFETCHW on AMD
				type_ |= tPREFETCHW;
			}

			if (EDX & (1U << 29)) {
				// Long mode implies support for PREFETCHW on AMD
				type_ |= tPREFETCHW;
			}
		}
		if (vendorEcx == get32bitAsBE(intel)) {
			type_ |= tINTEL;
		}

		// Extended flags information
		getCpuid(0x80000000, data);
		if (EAX >= 0x80000001) {
			getCpuid(0x80000001, data);

			if (EDX & (1U << 31)) type_ |= t3DN;
			if (EDX & (1U << 30)) type_ |= tE3DN;
			if (EDX & (1U << 27)) type_ |= tRDTSCP;
			if (EDX & (1U << 22)) type_ |= tMMX2;
			if (EDX & (1U << 15)) type_ |= tCMOV;
			if (ECX & (1U << 5)) type_ |= tLZCNT;
			if (ECX & (1U << 8)) type_ |= tPREFETCHW;
		}

		getCpuid(1, data);
		if (ECX & (1U << 0)) type_ |= tSSE3;
		if (ECX & (1U << 9)) type_ |= tSSSE3;
		if (ECX & (1U << 19)) type_ |= tSSE41;
		if (ECX & (1U << 20)) type_ |= tSSE42;
		if (ECX & (1U << 22)) type_ |= tMOVBE;
		if (ECX & (1U << 23)) type_ |= tPOPCNT;
		if (ECX & (1U << 25)) type_ |= tAESNI;
		if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
		if (ECX & (1U << 27)) type_ |= tOSXSAVE;
		if (ECX & (1U << 30)) type_ |= tRDRAND;
		if (ECX & (1U << 29)) type_ |= tF16C;

		if (EDX & (1U << 15)) type_ |= tCMOV;
		if (EDX & (1U << 23)) type_ |= tMMX;
		if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE;
		if (EDX & (1U << 26)) type_ |= tSSE2;

		if (type_ & tOSXSAVE) {
			// check XFEATURE_ENABLED_MASK[2:1] = '11b'
			uint64_t bv = getXfeature();
			if ((bv & 6) == 6) {
				// data[] still holds leaf 1 here
				if (ECX & (1U << 28)) type_ |= tAVX;
				if (ECX & (1U << 12)) type_ |= tFMA;
				// do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
#if !defined(__APPLE__)
				const bool avx512StateEnabled = ((bv >> 5) & 7) == 7;
#else
				const bool avx512StateEnabled = true;
#endif
				// leaf 7 is only meaningful when the CPU reports it as a valid basic leaf
				if (avx512StateEnabled && maxNum >= 7) {
					getCpuidEx(7, 0, data);
					if (EBX & (1U << 16)) type_ |= tAVX512F;
					if (type_ & tAVX512F) {
						if (EBX & (1U << 17)) type_ |= tAVX512DQ;
						if (EBX & (1U << 21)) type_ |= tAVX512_IFMA;
						if (EBX & (1U << 26)) type_ |= tAVX512PF;
						if (EBX & (1U << 27)) type_ |= tAVX512ER;
						if (EBX & (1U << 28)) type_ |= tAVX512CD;
						if (EBX & (1U << 30)) type_ |= tAVX512BW;
						if (EBX & (1U << 31)) type_ |= tAVX512VL;
						if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
						if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
						if (ECX & (1U << 8)) type_ |= tGFNI;
						if (ECX & (1U << 9)) type_ |= tVAES;
						if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
						if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
						if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
						if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
						if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
						if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
						if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
					}
				}
			}
		}
		if (maxNum >= 7) {
			getCpuidEx(7, 0, data);
			const uint32_t maxNumSubLeaves = EAX;
			if ((type_ & tAVX) && (EBX & (1U << 5))) type_ |= tAVX2;
			if (EBX & (1U << 3)) type_ |= tBMI1;
			if (EBX & (1U << 8)) type_ |= tBMI2;
			if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
			if (EBX & (1U << 18)) type_ |= tRDSEED;
			if (EBX & (1U << 19)) type_ |= tADX;
			if (EBX & (1U << 20)) type_ |= tSMAP;
			if (EBX & (1U << 4)) type_ |= tHLE;
			if (EBX & (1U << 11)) type_ |= tRTM;
			if (EBX & (1U << 14)) type_ |= tMPX;
			if (EBX & (1U << 29)) type_ |= tSHA;
			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
			if (EDX & (1U << 24)) type_ |= tAMX_TILE;
			if (EDX & (1U << 25)) type_ |= tAMX_INT8;
			if (EDX & (1U << 22)) type_ |= tAMX_BF16;
			if (maxNumSubLeaves >= 1) {
				getCpuidEx(7, 1, data);
				if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
				if (type_ & tAVX512F) {
					if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
				}
			}
		}
		setFamily();
		setNumCores();
		setCacheHierarchy();
	}
	/*
		print family/model information to stdout
		(does nothing when XBYAK_ONLY_CLASS_CPU is defined)
	*/
	void putFamily() const
	{
#ifndef XBYAK_ONLY_CLASS_CPU
		printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
			family, model, stepping, extFamily, extModel);
		printf("display:family=%X, model=%X\n", displayFamily, displayModel);
#endif
	}
	/*
		true if at least one of the feature bits in type was detected
	*/
	bool has(Type type) const
	{
		return (type & type_) != 0;
	}
};

#ifndef XBYAK_ONLY_CLASS_CPU
/**
	cycle counter based on rdtsc
	usage: begin(); ...; end(); repeated, then getClock() / getCount()
*/
class Clock {
public:
	// current time stamp counter; always 0 on non-x86 targets
	static inline uint64_t getRdtsc()
	{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
	#ifdef _MSC_VER
		return __rdtsc();
	#else
		unsigned int eax, edx;
		__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
		return ((uint64_t)edx << 32) | eax;
	#endif
#else
		// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
		return 0;
#endif
	}
	Clock()
		: clock_(0)
		, count_(0)
	{
	}
	// start a measured interval (subtracts the start stamp; end() adds the stop stamp)
	void begin()
	{
		clock_ -= getRdtsc();
	}
	// finish a measured interval and count it
	void end()
	{
		clock_ += getRdtsc();
		count_++;
	}
	// number of completed begin()/end() intervals
	int getCount() const { return count_; }
	// accumulated cycles over all completed intervals
	uint64_t getClock() const { return clock_; }
	void clear() { count_ = 0; clock_ = 0; }
private:
	uint64_t clock_;
	int count_;
};

#ifdef XBYAK64
const int UseRCX = 1 << 6;
const int UseRDX = 1 << 7;

/**
	small fixed-capacity list of pointers to Reg64
	@note only pointers are stored; the registers must outlive the Pack
*/
class Pack {
	static const size_t maxTblNum = 15;
	const Xbyak::Reg64 *tbl_[maxTblNum];
	size_t n_;
public:
	Pack() : tbl_(), n_(0) {}
	// members are value-initialized first so the Pack stays empty if init() rejects n
	Pack(const Xbyak::Reg64 *tbl, size_t n) : tbl_(), n_(0) { init(tbl, n); }
	Pack(const Pack& rhs)
		: n_(rhs.n_)
	{
		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
	}
	Pack& operator=(const Pack& rhs)
	{
		n_ = rhs.n_;
		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
		return *this;
	}
	/*
		The following constructors take the registers in reverse order:
		the last argument (t0) becomes element 0.
	*/
	Pack(const Xbyak::Reg64& t0)
	{ n_ = 1; tbl_[0] = &t0; }
	Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
	Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
	Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
	Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
	Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
	Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
	Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
	Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
	Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
	{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
	/*
		add t at the end; ERR_BAD_PARAMETER if the Pack is already full
	*/
	Pack& append(const Xbyak::Reg64& t)
	{
		if (n_ == maxTblNum) {
			fprintf(stderr, "ERR Pack::can't append\n");
			XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this)
		}
		tbl_[n_++] = &t;
		return *this;
	}
	/*
		refer to tbl[0..n); ERR_BAD_PARAMETER if n exceeds the capacity
	*/
	void init(const Xbyak::Reg64 *tbl, size_t n)
	{
		if (n > maxTblNum) {
			fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
			XBYAK_THROW(ERR_BAD_PARAMETER)
		}
		n_ = n;
		for (size_t i = 0; i < n; i++) {
			tbl_[i] = &tbl[i];
		}
	}
	const Xbyak::Reg64& operator[](size_t n) const
	{
		if (n >= n_) {
			fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
			XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax)
		}
		return *tbl_[n];
	}
	size_t size() const { return n_; }
	/*
		get tbl[pos, pos + num)
		num = size_t(-1) means "up to the end"
	*/
	Pack sub(size_t pos, size_t num = size_t(-1)) const
	{
		if (num == size_t(-1) && pos <= n_) num = n_ - pos;
		// written without "pos + num" so that huge values cannot wrap around and pass the check
		if (pos > n_ || num > n_ - pos) {
			fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
			XBYAK_THROW_RET(ERR_BAD_PARAMETER, Pack())
		}
		Pack pack;
		pack.n_ = num;
		for (size_t i = 0; i < num; i++) {
			pack.tbl_[i] = tbl_[pos + i];
		}
		return pack;
	}
	// print the register names to stdout
	void put() const
	{
		for (size_t i = 0; i < n_; i++) {
			printf("%s ", tbl_[i]->toString());
		}
		printf("\n");
	}
};

/**
	emit a function prolog on construction and the matching epilog on
	close() / destruction, and hand out parameter and temporary registers
	that are valid for the target calling convention
*/
class StackFrame {
#ifdef XBYAK64_WIN
	static const int noSaveNum = 6;
	static const int rcxPos = 0;
	static const int rdxPos = 1;
#else
	static const int noSaveNum = 8;
	static const int rcxPos = 3;
	static const int rdxPos = 2;
#endif
	static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
	Xbyak::CodeGenerator *code_;
	int pNum_;
	int tNum_;
	bool useRcx_;
	bool useRdx_;
	int saveNum_;
	int P_;
	bool makeEpilog_;
	Xbyak::Reg64 pTbl_[4];
	Xbyak::Reg64 tTbl_[maxRegNum];
	Pack p_;
	Pack t_;
	// non-copyable
	StackFrame(const StackFrame&);
	void operator=(const StackFrame&);
public:
	const Pack& p;
	const Pack& t;
	/*
		make stack frame
		@param sf [in] this
		@param pNum [in] num of function parameter(0 <= pNum <= 4)
		@param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
		@param stackSizeByte [in] local stack size
		@param makeEpilog [in] automatically call close() if true

		you can use
		rax
		gp0, ..., gp(pNum - 1)
		gt0, ..., gt(tNum-1)
		rcx if tNum & UseRCX
		rdx if tNum & UseRDX
		rsp[0..stackSizeByte - 1]
	*/
	StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
		: code_(code)
		, pNum_(pNum)
		, tNum_(tNum & ~(UseRCX | UseRDX))
		, useRcx_((tNum & UseRCX) != 0)
		, useRdx_((tNum & UseRDX) != 0)
		, saveNum_(0)
		, P_(0)
		, makeEpilog_(makeEpilog)
		, p(p_)
		, t(t_)
	{
		using namespace Xbyak;
		if (pNum < 0 || pNum > 4) XBYAK_THROW(ERR_BAD_PNUM)
		const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
		if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
		const Reg64& _rsp = code->rsp;
		// registers beyond the first noSaveNum entries of the order table are callee-saved
		saveNum_ = (std::max)(0, allRegNum - noSaveNum);
		const int *tbl = getOrderTbl() + noSaveNum;
		for (int i = 0; i < saveNum_; i++) {
			code->push(Reg64(tbl[i]));
		}
		P_ = (stackSizeByte + 7) / 8;
		if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
		P_ *= 8;
		if (P_ > 0) code->sub(_rsp, P_);
		int pos = 0;
		for (int i = 0; i < pNum; i++) {
			pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
		}
		for (int i = 0; i < tNum_; i++) {
			tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
		}
		// when rcx/rdx carries a parameter but is requested as scratch, move the parameter to r10/r11
		if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx);
		if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx);
		p_.init(pTbl_, pNum);
		t_.init(tTbl_, tNum_);
	}
	/*
		make epilog manually
		@param callRet [in] call ret() if true
	*/
	void close(bool callRet = true)
	{
		using namespace Xbyak;
		const Reg64& _rsp = code_->rsp;
		const int *tbl = getOrderTbl() + noSaveNum;
		if (P_ > 0) code_->add(_rsp, P_);
		// pop in the reverse order of the pushes done by the constructor
		for (int i = 0; i < saveNum_; i++) {
			code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
		}

		if (callRet) code_->ret();
	}
	~StackFrame()
	{
		if (!makeEpilog_) return;
		close();
	}
private:
	/*
		register allocation order: the first noSaveNum entries need no
		save/restore, the remaining ones are callee-saved
	*/
	const int *getOrderTbl() const
	{
		using namespace Xbyak;
		static const int tbl[] = {
#ifdef XBYAK64_WIN
			Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
#else
			Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
#endif
			Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
		};
		return &tbl[0];
	}
	/*
		return the next register index in allocation order and advance pos;
		rcx/rdx are replaced by r10/r11 when they are reserved as scratch,
		and r10/r11 themselves are then skipped
	*/
	int getRegIdx(int& pos) const
	{
		assert(pos < maxRegNum);
		using namespace Xbyak;
		const int *tbl = getOrderTbl();
		int r = tbl[pos++];
		if (useRcx_) {
			if (r == Operand::RCX) { return Operand::R10; }
			if (r == Operand::R10) { r = tbl[pos++]; }
		}
		if (useRdx_) {
			if (r == Operand::RDX) { return Operand::R11; }
			if (r == Operand::R11) { return tbl[pos++]; }
		}
		return r;
	}
};
#endif

/**
	register JIT-generated functions with a profiler
	(Linux perf map file or Intel VTune)
	@note NOTE(review): the class owns a FILE* in Perf mode but is copyable;
	do not copy an initialized Profiler (both copies would fclose the same handle).
*/
class Profiler {
	int mode_;
	const char *suffix_;
	const void *startAddr_;
#ifdef XBYAK_USE_PERF
	FILE *fp_;
#endif
public:
	enum {
		None = 0,
		Perf = 1,
		VTune = 2
	};
	Profiler()
		: mode_(None)
		, suffix_("")
		, startAddr_(0)
#ifdef XBYAK_USE_PERF
		, fp_(0)
#endif
	{
	}
	// append suffix to funcName
	// (the pointer is stored, not copied; it must stay valid while set() is used)
	void setNameSuffix(const char *suffix)
	{
		suffix_ = suffix;
	}
	// start address used by the next set(funcName, endAddr)
	void setStartAddr(const void *startAddr)
	{
		startAddr_ = startAddr;
	}
	/*
		select the profiler backend
		@param mode [in] None, Perf or VTune
		@note the mode stays None if the backend is not compiled in or
		cannot be activated
	*/
	void init(int mode)
	{
		mode_ = None;
		switch (mode) {
		default:
		case None:
			return;
		case Perf:
#ifdef XBYAK_USE_PERF
			close();
			{
				const int pid = getpid();
				char name[128];
				snprintf(name, sizeof(name), "/tmp/perf-%d.map", pid);
				fp_ = fopen(name, "a+");
				if (fp_ == 0) {
					fprintf(stderr, "can't open %s\n", name);
					return;
				}
			}
			mode_ = Perf;
#endif
			return;
		case VTune:
#ifdef XBYAK_USE_VTUNE
			dlopen("dummy", RTLD_LAZY); // force to load dlopen to enable jit profiling
			if (iJIT_IsProfilingActive() != iJIT_SAMPLING_ON) {
				fprintf(stderr, "VTune profiling is not active\n");
				return;
			}
			mode_ = VTune;
#endif
			return;
		}
	}
	~Profiler()
	{
		close();
	}
	// close the perf map file if it is open
	void close()
	{
#ifdef XBYAK_USE_PERF
		if (fp_ == 0) return;
		fclose(fp_);
		fp_ = 0;
#endif
	}
	/*
		register the function [startAddr, startAddr + funcSize) as funcName + suffix
	*/
	void set(const char *funcName, const void *startAddr, size_t funcSize) const
	{
		if (mode_ == None) return;
#if !defined(XBYAK_USE_PERF) && !defined(XBYAK_USE_VTUNE)
		(void)funcName;
		(void)startAddr;
		(void)funcSize;
#endif
#ifdef XBYAK_USE_PERF
		if (mode_ == Perf) {
			if (fp_ == 0) return;
			// %llx expects an unsigned long long
			fprintf(fp_, "%llx %zx %s%s", (unsigned long long)(size_t)startAddr, funcSize, funcName, suffix_);
			/*
				perf does not recognize the function name which is less than 3,
				so append '_' at the end of the name if necessary
			*/
			size_t n = strlen(funcName) + strlen(suffix_);
			for (size_t i = n; i < 3; i++) {
				fprintf(fp_, "_");
			}
			fprintf(fp_, "\n");
			fflush(fp_);
		}
#endif
#ifdef XBYAK_USE_VTUNE
		if (mode_ != VTune) return;
		char className[] = "";
		char fileName[] = "";
		iJIT_Method_Load jmethod = {};
		jmethod.method_id = iJIT_GetNewMethodID();
		jmethod.class_file_name = className;
		jmethod.source_file_name = fileName;
		jmethod.method_load_address = const_cast<void*>(startAddr);
		jmethod.method_size = funcSize;
		jmethod.line_number_size = 0;
		char buf[128];
		snprintf(buf, sizeof(buf), "%s%s", funcName, suffix_);
		jmethod.method_name = buf;
		iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, (void*)&jmethod);
#endif
	}
	/*
		for continuous set
		funcSize = endAddr - <previous set endAddr>
	*/
	void set(const char *funcName, const void *endAddr)
	{
		set(funcName, startAddr_, (size_t)endAddr - (size_t)startAddr_);
		startAddr_ = endAddr;
	}
};
#endif // XBYAK_ONLY_CLASS_CPU

} } // end of util

#endif