1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 // From Apache Impala (incubating) as of 2016-01-29.
19
20 #include "arrow/util/cpu_info.h"
21
22 #ifdef __APPLE__
23 #include <sys/sysctl.h>
24 #endif
25
26 #include <stdlib.h>
27 #include <string.h>
28
29 #ifndef _MSC_VER
30 #include <unistd.h>
31 #endif
32
33 #ifdef _WIN32
34 #include <immintrin.h>
35 #include <intrin.h>
36 #include <array>
37 #include <bitset>
38
39 #include "arrow/util/windows_compatibility.h"
40 #endif
41
42 #include <algorithm>
43 #include <cctype>
44 #include <cerrno>
45 #include <cstdint>
46 #include <fstream>
47 #include <memory>
48 #include <mutex>
49 #include <string>
50
51 #include "arrow/result.h"
52 #include "arrow/util/io_util.h"
53 #include "arrow/util/logging.h"
54 #include "arrow/util/optional.h"
55 #include "arrow/util/string.h"
56
57 namespace arrow {
58 namespace internal {
59
60 namespace {
61
62 using std::max;
63
// Fallback cache sizes, in bytes, for when no size can be obtained from the system.
constexpr int64_t kDefaultL1CacheSize = 32 * 1024;        // L1: 32 KiB
constexpr int64_t kDefaultL2CacheSize = 256 * 1024;       // L2: 256 KiB
constexpr int64_t kDefaultL3CacheSize = 3 * 1024 * 1024;  // L3: 3 MiB
67
68 #if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5
// Stand-in for the __cpuidex intrinsic, compiled only for MinGW-w64 releases older
// than 5 (see the enclosing #if).
//
// Executes CPUID with EAX = function_id and ECX = subfunction_id and stores the
// resulting EAX, EBX, ECX and EDX values into CPUInfo[0..3], in that order.
void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) {
  int eax = 0;
  int ebx = 0;
  int ecx = 0;
  int edx = 0;
  __asm__ __volatile__("cpuid"
                       : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                       : "a"(function_id), "c"(subfunction_id));
  CPUInfo[0] = eax;
  CPUInfo[1] = ebx;
  CPUInfo[2] = ecx;
  CPUInfo[3] = edx;
}
75
// Stand-in for the _xgetbv intrinsic, compiled only for MinGW-w64 releases older
// than 5 (see the enclosing #if).
//
// Executes XGETBV for the extended control register selected by `xcr` and returns
// the full 64-bit register value. XGETBV delivers its result in EDX:EAX, so both
// halves are captured and combined (capturing EAX alone would drop the upper bits).
// The caller must first check that the OS has enabled XSAVE (CPUID.1:ECX.OSXSAVE),
// otherwise the instruction faults.
int64_t _xgetbv(int xcr) {
  uint32_t low = 0;
  uint32_t high = 0;
  __asm__ __volatile__("xgetbv" : "=a"(low), "=d"(high) : "c"(xcr));
  return static_cast<int64_t>((static_cast<uint64_t>(high) << 32) | low);
}
81 #endif
82
83 #ifdef __APPLE__
IntegerSysCtlByName(const char * name)84 util::optional<int64_t> IntegerSysCtlByName(const char* name) {
85 size_t len = sizeof(int64_t);
86 int64_t data = 0;
87 if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) {
88 return data;
89 }
90 // ENOENT is the official errno value for non-existing sysctl's,
91 // but EINVAL and ENOTSUP have been seen in the wild.
92 if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) {
93 auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'");
94 ARROW_LOG(WARNING) << st.ToString();
95 }
96 return util::nullopt;
97 }
98 #endif
99
100 #if defined(__GNUC__) && defined(__linux__) && defined(__aarch64__)
101 // There is no direct instruction to get cache size on Arm64 like '__cpuid' on x86;
102 // Get Arm64 cache size by reading '/sys/devices/system/cpu/cpu0/cache/index*/size';
103 // index* :
104 // index0: L1 Dcache
105 // index1: L1 Icache
106 // index2: L2 cache
107 // index3: L3 cache
// sysfs files (for cpu0) from which the L1 data, L2 and L3 cache sizes are read.
constexpr const char* kL1CacheSizeFile =
    "/sys/devices/system/cpu/cpu0/cache/index0/size";
constexpr const char* kL2CacheSizeFile =
    "/sys/devices/system/cpu/cpu0/cache/index2/size";
constexpr const char* kL3CacheSizeFile =
    "/sys/devices/system/cpu/cpu0/cache/index3/size";
111
// Read a cache size from a sysfs-style "size" file.
//
// The first line of `filename` is parsed as an unsigned integer (base auto-detected
// by strtoull) optionally followed by a unit suffix: 'k'/'K', 'm'/'M' or 'g'/'G',
// standing for successive powers of 1024.
//
// Returns the size in bytes. Returns `default_size` if the file cannot be opened,
// no line can be read, the line does not start with a number, or the number is out
// of range.
int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
  FILE* cache_file = fopen(filename, "r");
  if (cache_file == nullptr) {
    return default_size;
  }

  char* content = nullptr;
  size_t content_capacity = 0;
  const auto num_read = getline(&content, &content_capacity, cache_file);
  fclose(cache_file);
  // getline() may allocate the buffer even when it fails, so take ownership before
  // looking at the result; otherwise the error path would leak it.
  std::unique_ptr<char, decltype(&free)> content_guard(content, &free);
  if (num_read == -1) {
    return default_size;
  }

  char* last_char = nullptr;
  errno = 0;
  const auto cardinal_num = strtoull(content, &last_char, 0);
  // last_char == content means that no digits were consumed at all.
  if (errno != 0 || last_char == content) {
    return default_size;
  }

  // kB, MB, or GB: each case deliberately falls through to the smaller units so
  // that the multiplier accumulates one factor of 1024 per unit step.
  int64_t multip = 1;
  switch (*last_char) {
    case 'g':
    case 'G':
      multip *= 1024;
      // fall through
    case 'm':
    case 'M':
      multip *= 1024;
      // fall through
    case 'k':
    case 'K':
      multip *= 1024;
      break;
    default:
      break;
  }
  return static_cast<int64_t>(cardinal_num * multip);
}
149 #endif
150
151 #if !defined(_WIN32) && !defined(__APPLE__)
152 struct {
153 std::string name;
154 int64_t flag;
155 } flag_mappings[] = {
156 #if (defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64))
157 {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1},
158 {"sse4_2", CpuInfo::SSE4_2}, {"popcnt", CpuInfo::POPCNT},
159 {"avx", CpuInfo::AVX}, {"avx2", CpuInfo::AVX2},
160 {"avx512f", CpuInfo::AVX512F}, {"avx512cd", CpuInfo::AVX512CD},
161 {"avx512vl", CpuInfo::AVX512VL}, {"avx512dq", CpuInfo::AVX512DQ},
162 {"avx512bw", CpuInfo::AVX512BW}, {"bmi1", CpuInfo::BMI1},
163 {"bmi2", CpuInfo::BMI2},
164 #endif
165 #if defined(__aarch64__)
166 {"asimd", CpuInfo::ASIMD},
167 #endif
168 };
169 const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
170
171 // Helper function to parse for hardware flags.
172 // values contains a list of space-separated flags. check to see if the flags we
173 // care about are present.
174 // Returns a bitmap of flags.
ParseCPUFlags(const std::string & values)175 int64_t ParseCPUFlags(const std::string& values) {
176 int64_t flags = 0;
177 for (int i = 0; i < num_flags; ++i) {
178 if (values.find(flag_mappings[i].name) != std::string::npos) {
179 flags |= flag_mappings[i].flag;
180 }
181 }
182 return flags;
183 }
184 #endif
185
186 #ifdef _WIN32
RetrieveCacheSize(int64_t * cache_sizes)187 bool RetrieveCacheSize(int64_t* cache_sizes) {
188 if (!cache_sizes) {
189 return false;
190 }
191 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr;
192 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr;
193 DWORD buffer_size = 0;
194 size_t offset = 0;
195 typedef BOOL(WINAPI * GetLogicalProcessorInformationFuncPointer)(void*, void*);
196 GetLogicalProcessorInformationFuncPointer func_pointer =
197 (GetLogicalProcessorInformationFuncPointer)GetProcAddress(
198 GetModuleHandle("kernel32"), "GetLogicalProcessorInformation");
199
200 if (!func_pointer) {
201 return false;
202 }
203
204 // Get buffer size
205 if (func_pointer(buffer, &buffer_size) && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
206 return false;
207
208 buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(buffer_size);
209
210 if (!buffer || !func_pointer(buffer, &buffer_size)) {
211 return false;
212 }
213
214 buffer_position = buffer;
215 while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= buffer_size) {
216 if (RelationCache == buffer_position->Relationship) {
217 PCACHE_DESCRIPTOR cache = &buffer_position->Cache;
218 if (cache->Level >= 1 && cache->Level <= 3) {
219 cache_sizes[cache->Level - 1] += cache->Size;
220 }
221 }
222 offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
223 buffer_position++;
224 }
225
226 if (buffer) {
227 free(buffer);
228 }
229 return true;
230 }
231
232 // Source: https://en.wikipedia.org/wiki/CPUID
RetrieveCPUInfo(int64_t * hardware_flags,std::string * model_name,CpuInfo::Vendor * vendor)233 bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name,
234 CpuInfo::Vendor* vendor) {
235 if (!hardware_flags || !model_name || !vendor) {
236 return false;
237 }
238 int register_EAX_id = 1;
239 int highest_valid_id = 0;
240 int highest_extended_valid_id = 0;
241 std::bitset<32> features_ECX;
242 std::array<int, 4> cpu_info;
243
244 // Get highest valid id
245 __cpuid(cpu_info.data(), 0);
246 highest_valid_id = cpu_info[0];
247 // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C
248 // HEX of "AuthenticAMD": 41757468 656E7469 63414D44
249 if (cpu_info[1] == 0x756e6547 && cpu_info[2] == 0x49656e69 &&
250 cpu_info[3] == 0x6c65746e) {
251 *vendor = CpuInfo::Vendor::Intel;
252 } else if (cpu_info[1] == 0x68747541 && cpu_info[2] == 0x69746e65 &&
253 cpu_info[3] == 0x444d4163) {
254 *vendor = CpuInfo::Vendor::AMD;
255 }
256
257 if (highest_valid_id <= register_EAX_id) return false;
258
259 // EAX=1: Processor Info and Feature Bits
260 __cpuidex(cpu_info.data(), register_EAX_id, 0);
261 features_ECX = cpu_info[2];
262
263 // Get highest extended id
264 __cpuid(cpu_info.data(), 0x80000000);
265 highest_extended_valid_id = cpu_info[0];
266
267 // Retrieve CPU model name
268 if (highest_extended_valid_id >= static_cast<int>(0x80000004)) {
269 model_name->clear();
270 for (int i = 0x80000002; i <= static_cast<int>(0x80000004); ++i) {
271 __cpuidex(cpu_info.data(), i, 0);
272 *model_name +=
273 std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info));
274 }
275 }
276
277 bool zmm_enabled = false;
278 if (features_ECX[27]) { // OSXSAVE
279 // Query if the OS supports saving ZMM registers when switching contexts
280 int64_t xcr0 = _xgetbv(0);
281 zmm_enabled = (xcr0 & 0xE0) == 0xE0;
282 }
283
284 if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
285 if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
286 if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
287 if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT;
288 if (features_ECX[23]) *hardware_flags |= CpuInfo::AVX;
289
290 // cpuid with EAX=7, ECX=0: Extended Features
291 register_EAX_id = 7;
292 if (highest_valid_id > register_EAX_id) {
293 __cpuidex(cpu_info.data(), register_EAX_id, 0);
294 std::bitset<32> features_EBX = cpu_info[1];
295
296 if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1;
297 if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2;
298 if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2;
299 // ARROW-11427: only use AVX512 if enabled by the OS
300 if (zmm_enabled) {
301 if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
302 if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
303 if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
304 if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
305 if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
306 }
307 }
308
309 return true;
310 }
311 #endif
312
313 } // namespace
314
CpuInfo()315 CpuInfo::CpuInfo()
316 : hardware_flags_(0),
317 num_cores_(1),
318 model_name_("unknown"),
319 vendor_(Vendor::Unknown) {}
320
321 std::unique_ptr<CpuInfo> g_cpu_info;
322 static std::once_flag cpuinfo_initialized;
323
GetInstance()324 CpuInfo* CpuInfo::GetInstance() {
325 std::call_once(cpuinfo_initialized, []() {
326 g_cpu_info.reset(new CpuInfo);
327 g_cpu_info->Init();
328 });
329 return g_cpu_info.get();
330 }
331
Init()332 void CpuInfo::Init() {
333 std::string line;
334 std::string name;
335 std::string value;
336
337 float max_mhz = 0;
338 int num_cores = 0;
339
340 memset(&cache_sizes_, 0, sizeof(cache_sizes_));
341
342 #ifdef _WIN32
343 SYSTEM_INFO system_info;
344 GetSystemInfo(&system_info);
345 num_cores = system_info.dwNumberOfProcessors;
346
347 LARGE_INTEGER performance_frequency;
348 if (QueryPerformanceFrequency(&performance_frequency)) {
349 max_mhz = static_cast<float>(performance_frequency.QuadPart);
350 }
351 #elif defined(__APPLE__)
352 // On macOS, get CPU information from system information base
353 struct SysCtlCpuFeature {
354 const char* name;
355 int64_t flag;
356 };
357 std::vector<SysCtlCpuFeature> features = {
358 #if defined(__aarch64__)
359 // ARM64 (note that this is exposed under Rosetta as well)
360 {"hw.optional.neon", ASIMD},
361 #else
362 // x86
363 {"hw.optional.sse4_2", SSSE3 | SSE4_1 | SSE4_2 | POPCNT},
364 {"hw.optional.avx1_0", AVX},
365 {"hw.optional.avx2_0", AVX2},
366 {"hw.optional.bmi1", BMI1},
367 {"hw.optional.bmi2", BMI2},
368 {"hw.optional.avx512f", AVX512F},
369 {"hw.optional.avx512cd", AVX512CD},
370 {"hw.optional.avx512dq", AVX512DQ},
371 {"hw.optional.avx512bw", AVX512BW},
372 {"hw.optional.avx512vl", AVX512VL},
373 #endif
374 };
375 for (const auto& feature : features) {
376 auto v = IntegerSysCtlByName(feature.name);
377 if (v.value_or(0)) {
378 hardware_flags_ |= feature.flag;
379 }
380 }
381 #else
382 // Read from /proc/cpuinfo
383 std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
384 while (cpuinfo) {
385 std::getline(cpuinfo, line);
386 size_t colon = line.find(':');
387 if (colon != std::string::npos) {
388 name = TrimString(line.substr(0, colon - 1));
389 value = TrimString(line.substr(colon + 1, std::string::npos));
390 if (name.compare("flags") == 0 || name.compare("Features") == 0) {
391 hardware_flags_ |= ParseCPUFlags(value);
392 } else if (name.compare("cpu MHz") == 0) {
393 // Every core will report a different speed. We'll take the max, assuming
394 // that when impala is running, the core will not be in a lower power state.
395 // TODO: is there a more robust way to do this, such as
396 // Window's QueryPerformanceFrequency()
397 float mhz = static_cast<float>(atof(value.c_str()));
398 max_mhz = max(mhz, max_mhz);
399 } else if (name.compare("processor") == 0) {
400 ++num_cores;
401 } else if (name.compare("model name") == 0) {
402 model_name_ = value;
403 } else if (name.compare("vendor_id") == 0) {
404 if (value.compare("GenuineIntel") == 0) {
405 vendor_ = Vendor::Intel;
406 } else if (value.compare("AuthenticAMD") == 0) {
407 vendor_ = Vendor::AMD;
408 }
409 }
410 }
411 }
412 if (cpuinfo.is_open()) cpuinfo.close();
413 #endif
414
415 #ifdef __APPLE__
416 // On macOS, get cache size from system information base
417 SetDefaultCacheSize();
418 auto c = IntegerSysCtlByName("hw.l1dcachesize");
419 if (c.has_value()) {
420 cache_sizes_[0] = *c;
421 }
422 c = IntegerSysCtlByName("hw.l2cachesize");
423 if (c.has_value()) {
424 cache_sizes_[1] = *c;
425 }
426 c = IntegerSysCtlByName("hw.l3cachesize");
427 if (c.has_value()) {
428 cache_sizes_[2] = *c;
429 }
430 #elif _WIN32
431 if (!RetrieveCacheSize(cache_sizes_)) {
432 SetDefaultCacheSize();
433 }
434 RetrieveCPUInfo(&hardware_flags_, &model_name_, &vendor_);
435 #else
436 SetDefaultCacheSize();
437 #endif
438
439 if (max_mhz != 0) {
440 cycles_per_ms_ = static_cast<int64_t>(max_mhz);
441 #ifndef _WIN32
442 cycles_per_ms_ *= 1000;
443 #endif
444 } else {
445 cycles_per_ms_ = 1000000;
446 }
447 original_hardware_flags_ = hardware_flags_;
448
449 if (num_cores > 0) {
450 num_cores_ = num_cores;
451 } else {
452 num_cores_ = 1;
453 }
454
455 // Parse the user simd level
456 ParseUserSimdLevel();
457 }
458
VerifyCpuRequirements()459 void CpuInfo::VerifyCpuRequirements() {
460 #ifdef ARROW_HAVE_SSE4_2
461 if (!IsSupported(CpuInfo::SSSE3)) {
462 DCHECK(false) << "CPU does not support the Supplemental SSE3 instruction set";
463 }
464 #endif
465 #if defined(ARROW_HAVE_NEON)
466 if (!IsSupported(CpuInfo::ASIMD)) {
467 DCHECK(false) << "CPU does not support the Armv8 Neon instruction set";
468 }
469 #endif
470 }
471
CanUseSSE4_2() const472 bool CpuInfo::CanUseSSE4_2() const {
473 #if defined(ARROW_HAVE_SSE4_2)
474 return IsSupported(CpuInfo::SSE4_2);
475 #else
476 return false;
477 #endif
478 }
479
EnableFeature(int64_t flag,bool enable)480 void CpuInfo::EnableFeature(int64_t flag, bool enable) {
481 if (!enable) {
482 hardware_flags_ &= ~flag;
483 } else {
484 // Can't turn something on that can't be supported
485 DCHECK_NE(original_hardware_flags_ & flag, 0);
486 hardware_flags_ |= flag;
487 }
488 }
489
hardware_flags()490 int64_t CpuInfo::hardware_flags() { return hardware_flags_; }
491
CacheSize(CacheLevel level)492 int64_t CpuInfo::CacheSize(CacheLevel level) { return cache_sizes_[level]; }
493
cycles_per_ms()494 int64_t CpuInfo::cycles_per_ms() { return cycles_per_ms_; }
495
num_cores()496 int CpuInfo::num_cores() { return num_cores_; }
497
model_name()498 std::string CpuInfo::model_name() { return model_name_; }
499
SetDefaultCacheSize()500 void CpuInfo::SetDefaultCacheSize() {
501 #if defined(_SC_LEVEL1_DCACHE_SIZE) && !defined(__aarch64__)
502 // Call sysconf to query for the cache sizes
503 cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
504 cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE);
505 cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE);
506 ARROW_UNUSED(kDefaultL1CacheSize);
507 ARROW_UNUSED(kDefaultL2CacheSize);
508 ARROW_UNUSED(kDefaultL3CacheSize);
509 #elif defined(__GNUC__) && defined(__linux__) && defined(__aarch64__)
510 cache_sizes_[0] = GetArm64CacheSize(kL1CacheSizeFile, kDefaultL1CacheSize);
511 cache_sizes_[1] = GetArm64CacheSize(kL2CacheSizeFile, kDefaultL2CacheSize);
512 cache_sizes_[2] = GetArm64CacheSize(kL3CacheSizeFile, kDefaultL3CacheSize);
513 #else
514 // Provide reasonable default values if no info
515 cache_sizes_[0] = kDefaultL1CacheSize;
516 cache_sizes_[1] = kDefaultL2CacheSize;
517 cache_sizes_[2] = kDefaultL3CacheSize;
518 #endif
519 }
520
ParseUserSimdLevel()521 void CpuInfo::ParseUserSimdLevel() {
522 auto maybe_env_var = GetEnvVar("ARROW_USER_SIMD_LEVEL");
523 if (!maybe_env_var.ok()) {
524 // No user settings
525 return;
526 }
527 std::string s = *std::move(maybe_env_var);
528 std::transform(s.begin(), s.end(), s.begin(),
529 [](unsigned char c) { return std::toupper(c); });
530
531 int level = USER_SIMD_MAX;
532 // Parse the level
533 if (s == "AVX512") {
534 level = USER_SIMD_AVX512;
535 } else if (s == "AVX2") {
536 level = USER_SIMD_AVX2;
537 } else if (s == "AVX") {
538 level = USER_SIMD_AVX;
539 } else if (s == "SSE4_2") {
540 level = USER_SIMD_SSE4_2;
541 } else if (s == "NONE") {
542 level = USER_SIMD_NONE;
543 } else if (!s.empty()) {
544 ARROW_LOG(WARNING) << "Invalid value for ARROW_USER_SIMD_LEVEL: " << s;
545 }
546
547 // Disable feature as the level
548 if (level < USER_SIMD_AVX512) { // Disable all AVX512 features
549 EnableFeature(AVX512, false);
550 }
551 if (level < USER_SIMD_AVX2) { // Disable all AVX2 features
552 EnableFeature(AVX2 | BMI2, false);
553 }
554 if (level < USER_SIMD_AVX) { // Disable all AVX features
555 EnableFeature(AVX, false);
556 }
557 if (level < USER_SIMD_SSE4_2) { // Disable all SSE4_2 features
558 EnableFeature(SSE4_2 | BMI1, false);
559 }
560 }
561
562 } // namespace internal
563 } // namespace arrow
564