1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "cpu.h"
16
17 #include "platform.h"
18
19 #include <limits.h>
20 #include <stdio.h>
21 #include <string.h>
22
23 #ifdef _OPENMP
24 #if NCNN_SIMPLEOMP
25 #include "simpleomp.h"
26 #else
27 #include <omp.h>
28 #endif
29 #endif
30
31 #ifdef _MSC_VER
32 #include <intrin.h> // __cpuid()
33 #include <immintrin.h> // _xgetbv()
34 #endif
35
36 #ifdef __EMSCRIPTEN__
37 #include <emscripten/threading.h>
38 #endif
39
40 #if defined __ANDROID__ || defined __linux__
41 #if defined __ANDROID__
42 #include <dlfcn.h>
43 #endif
44 #include <stdint.h>
45 #include <sys/syscall.h>
46 #include <unistd.h>
47 #endif
48
49 #if __APPLE__
50 #include <mach/mach.h>
51 #include <mach/machine.h>
52 #include <mach/thread_act.h>
53 #include <sys/sysctl.h>
54 #include <sys/types.h>
55 #include "TargetConditionals.h"
56 #if TARGET_OS_IPHONE
57 #define __IOS__ 1
58 #endif
59 // define missing cpu model for old sdk
60 #ifndef CPUFAMILY_ARM_HURRICANE
61 #define CPUFAMILY_ARM_HURRICANE 0x67ceee93
62 #endif
63 // A11
64 #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
65 #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
66 #endif
67 // A12
68 #ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
69 #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
70 #endif
71 // A13
72 #ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
73 #define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
74 #endif
75 // A14
76 #ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
77 #define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
78 #endif
79 // A15
80 #ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD
81 #define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d
82 #endif
83 // M1
84 #ifndef CPUFAMILY_AARCH64_FIRESTORM_ICESTORM
85 #define CPUFAMILY_AARCH64_FIRESTORM_ICESTORM 0x1b588bb3
86 #endif
87 #endif // __APPLE__
88
89 #if defined(__SSE3__)
90 #include <immintrin.h>
91 #endif
92
93 namespace ncnn {
94
95 #if defined __ANDROID__ || defined __linux__
96
97 #define AT_HWCAP 16
98 #define AT_HWCAP2 26
99
100 #if defined __ANDROID__
101 // Probe the system's C library for a 'getauxval' function and call it if
102 // it exits, or return 0 for failure. This function is available since API
103 // level 20.
104 //
105 // This code does *NOT* check for '__ANDROID_API__ >= 20' to support the
106 // edge case where some NDK developers use headers for a platform that is
107 // newer than the one really targetted by their application.
108 // This is typically done to use newer native APIs only when running on more
109 // recent Android versions, and requires careful symbol management.
110 //
111 // Note that getauxval() can't really be re-implemented here, because
112 // its implementation does not parse /proc/self/auxv. Instead it depends
113 // on values that are passed by the kernel at process-init time to the
114 // C runtime initialization layer.
get_elf_hwcap_from_getauxval()115 static unsigned int get_elf_hwcap_from_getauxval()
116 {
117 typedef unsigned long getauxval_func_t(unsigned long);
118
119 dlerror();
120 void* libc_handle = dlopen("libc.so", RTLD_NOW);
121 if (!libc_handle)
122 {
123 NCNN_LOGE("dlopen libc.so failed %s", dlerror());
124 return 0;
125 }
126
127 unsigned int result = 0;
128 getauxval_func_t* func = (getauxval_func_t*)dlsym(libc_handle, "getauxval");
129 if (!func)
130 {
131 NCNN_LOGE("dlsym getauxval failed");
132 }
133 else
134 {
135 // Note: getauxval() returns 0 on failure. Doesn't touch errno.
136 result = (unsigned int)(*func)(AT_HWCAP);
137 }
138 dlclose(libc_handle);
139
140 return result;
141 }
142 #endif // defined __ANDROID__
143
144 // extract the ELF HW capabilities bitmap from /proc/self/auxv
get_elf_hwcap_from_proc_self_auxv()145 static unsigned int get_elf_hwcap_from_proc_self_auxv()
146 {
147 FILE* fp = fopen("/proc/self/auxv", "rb");
148 if (!fp)
149 {
150 NCNN_LOGE("fopen /proc/self/auxv failed");
151 return 0;
152 }
153
154 #if __aarch64__ || __riscv_xlen == 64
155 struct
156 {
157 uint64_t tag;
158 uint64_t value;
159 } entry;
160 #else
161 struct
162 {
163 unsigned int tag;
164 unsigned int value;
165 } entry;
166
167 #endif
168
169 unsigned int result = 0;
170 while (!feof(fp))
171 {
172 int nread = fread((char*)&entry, sizeof(entry), 1, fp);
173 if (nread != 1)
174 break;
175
176 if (entry.tag == 0 && entry.value == 0)
177 break;
178
179 if (entry.tag == AT_HWCAP)
180 {
181 result = entry.value;
182 break;
183 }
184 }
185
186 fclose(fp);
187
188 return result;
189 }
190
get_elf_hwcap()191 static unsigned int get_elf_hwcap()
192 {
193 #if defined __ANDROID__
194 unsigned int hwcap = get_elf_hwcap_from_getauxval();
195 if (hwcap)
196 return hwcap;
197 #endif
198
199 return get_elf_hwcap_from_proc_self_auxv();
200 }
201
202 static unsigned int g_hwcaps = get_elf_hwcap();
203
204 #if __aarch64__
205 // from arch/arm64/include/uapi/asm/hwcap.h
206 #define HWCAP_ASIMD (1 << 1)
207 #define HWCAP_ASIMDHP (1 << 10)
208 #define HWCAP_ASIMDDP (1 << 20)
209 #else
210 // from arch/arm/include/uapi/asm/hwcap.h
211 #define HWCAP_NEON (1 << 12)
212 #define HWCAP_VFPv4 (1 << 16)
213 #endif
214
215 #if __mips__
216 // from arch/mips/include/uapi/asm/hwcap.h
217 #define HWCAP_MIPS_MSA (1 << 1)
218 #define HWCAP_LOONGSON_MMI (1 << 11)
219 #endif
220
221 #if __riscv
222 // from arch/riscv/include/uapi/asm/hwcap.h
223 #define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
224 #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
225 #endif
226
227 #endif // defined __ANDROID__ || defined __linux__
228
229 #if __APPLE__
// query the "hw.cpufamily" sysctl, e.g. CPUFAMILY_ARM_FIRESTORM_ICESTORM
// on A14 / M1 devices; returns 0 if the sysctl fails
static unsigned int get_hw_cpufamily()
{
    unsigned int value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
    return value;
}

// query the "hw.cputype" sysctl, e.g. CPU_TYPE_ARM64; returns 0 on failure
static cpu_type_t get_hw_cputype()
{
    cpu_type_t value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cputype", &value, &len, NULL, 0);
    return value;
}

// query the "hw.cpusubtype" sysctl, e.g. CPU_SUBTYPE_ARM_V7S; returns 0 on failure
static cpu_subtype_t get_hw_cpusubtype()
{
    cpu_subtype_t value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
    return value;
}

// cached once during static initialization
static unsigned int g_hw_cpufamily = get_hw_cpufamily();
static cpu_type_t g_hw_cputype = get_hw_cputype();
static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
257 #endif // __APPLE__
258
259 #if defined __ANDROID__ || defined __linux__
// linux implementation backed by the kernel cpu_set_t bitmask
CpuSet::CpuSet()
{
    disable_all();
}

// mark cpu as allowed in the mask
void CpuSet::enable(int cpu)
{
    CPU_SET(cpu, &cpu_set);
}

// remove cpu from the mask
void CpuSet::disable(int cpu)
{
    CPU_CLR(cpu, &cpu_set);
}

// clear the whole mask
void CpuSet::disable_all()
{
    CPU_ZERO(&cpu_set);
}

bool CpuSet::is_enabled(int cpu) const
{
    return CPU_ISSET(cpu, &cpu_set);
}

// count set bits by scanning every position representable in cpu_set_t
int CpuSet::num_enabled() const
{
    int num_enabled = 0;
    for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
    {
        if (is_enabled(i))
            num_enabled++;
    }

    return num_enabled;
}
296 #elif __APPLE__
// apple implementation backed by a plain integer bitmask ('policy');
// only cpus 0 .. sizeof(policy)*8-1 can be represented
CpuSet::CpuSet()
{
    disable_all();
}

// set the bit for cpu
void CpuSet::enable(int cpu)
{
    policy |= (1 << cpu);
}

// clear the bit for cpu
void CpuSet::disable(int cpu)
{
    policy &= ~(1 << cpu);
}

// clear every bit
void CpuSet::disable_all()
{
    policy = 0;
}

bool CpuSet::is_enabled(int cpu) const
{
    return policy & (1 << cpu);
}

// count set bits across the whole policy word
int CpuSet::num_enabled() const
{
    int num_enabled = 0;
    for (int i = 0; i < (int)sizeof(policy) * 8; i++)
    {
        if (is_enabled(i))
            num_enabled++;
    }

    return num_enabled;
}
333 #else
// fallback implementation for platforms without affinity support:
// the mask carries no state, every cpu reports as enabled
CpuSet::CpuSet()
{
}

void CpuSet::enable(int /* cpu */)
{
}

void CpuSet::disable(int /* cpu */)
{
}

void CpuSet::disable_all()
{
}

// affinity cannot be restricted here, so every cpu counts as enabled
bool CpuSet::is_enabled(int /* cpu */) const
{
    return true;
}

// consistent with is_enabled(): all detected cpus are "enabled"
int CpuSet::num_enabled() const
{
    return get_cpu_count();
}
359 #endif
360
// Returns nonzero when the runtime cpu supports ARM NEON (ASIMD on aarch64).
int cpu_support_arm_neon()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    // advanced simd is the aarch64 successor of NEON
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_NEON;
#endif
#elif __APPLE__
#if __aarch64__
    // every arm64 apple device has NEON
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    // NOTE(review): strict '>' excludes plain CPU_SUBTYPE_ARM_V7 itself —
    // confirm whether that exclusion is intended
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
    return 0;
#endif
}
379
// Returns nonzero when the runtime cpu supports ARM VFPv4 (fma, fp16 conversion).
int cpu_support_arm_vfpv4()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    // neon always enable fma and fp16
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __APPLE__
#if __aarch64__
    // all arm64 apple chips have vfpv4
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    // NOTE(review): strict '>' excludes CPU_SUBTYPE_ARM_V7S itself —
    // confirm whether '>=' was intended
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
    return 0;
#endif
}
399
// Returns nonzero when the runtime cpu supports aarch64 half-precision
// asimd arithmetic (fp16 fmla etc.).
int cpu_support_arm_asimdhp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDHP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // apple A11 (monsoon/mistral) and later families
    return g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
418
// Returns nonzero when the runtime cpu supports aarch64 asimd dot product
// instructions (sdot/udot).
int cpu_support_arm_asimddp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDDP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // apple A13 (lightning/thunder) and later families
    return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
437
// Returns nonzero when the runtime cpu and OS both support AVX2.
int cpu_support_x86_avx2()
{
#if !NCNN_AVX2
    return 0;
#endif
#if (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
#if defined(_MSC_VER)
    // TODO move to init function
    int cpu_info[4];
    __cpuid(cpu_info, 0);

    // leaf 0 eax reports the highest supported standard cpuid leaf;
    // leaf 7 is required for the AVX2 feature bit
    int nIds = cpu_info[0];
    if (nIds < 7)
        return 0;

    __cpuid(cpu_info, 1);
    // check AVX XSAVE OSXSAVE
    // ecx bit 28 = AVX, bit 26 = XSAVE, bit 27 = OSXSAVE
    if (!(cpu_info[2] & 0x10000000) || !(cpu_info[2] & 0x04000000) || !(cpu_info[2] & 0x08000000))
        return 0;

    // check XSAVE enabled by kernel
    // xcr0 bits 1 and 2 = xmm and ymm state saved/restored by the OS
    if ((_xgetbv(0) & 6) != 6)
        return 0;

    __cpuid(cpu_info, 7);
    // leaf 7 ebx bit 5 = AVX2
    return cpu_info[1] & 0x00000020;
#elif defined(__clang__)
#if __clang_major__ >= 6
    __builtin_cpu_init();
#endif
    return __builtin_cpu_supports("avx2");
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
    __builtin_cpu_init();
    return __builtin_cpu_supports("avx2");
#else
    // TODO: other x86 compilers checking avx2 here
    NCNN_LOGE("AVX2 detection method is unknown for current compiler");
    return 0;
#endif
#else
    return 0;
#endif
}
481
// Returns nonzero when the runtime cpu and OS both support AVX.
int cpu_support_x86_avx()
{
#if !NCNN_AVX
    return 0;
#endif
#if (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
#if defined(_MSC_VER)
    // TODO move to init function
    int cpu_info[4];
    __cpuid(cpu_info, 0);

    // leaf 0 eax reports the highest supported standard cpuid leaf
    int nIds = cpu_info[0];
    if (nIds < 7)
        return 0;

    __cpuid(cpu_info, 1);
    // check AVX XSAVE OSXSAVE
    // ecx bit 28 = AVX, bit 26 = XSAVE, bit 27 = OSXSAVE
    if (!(cpu_info[2] & 0x10000000) || !(cpu_info[2] & 0x04000000) || !(cpu_info[2] & 0x08000000))
        return 0;

    // check XSAVE enabled by kernel
    // xcr0 bits 1 and 2 = xmm and ymm state saved/restored by the OS
    if ((_xgetbv(0) & 6) != 6)
        return 0;
    return 1;
#elif defined(__clang__)
#if __clang_major__ >= 6
    __builtin_cpu_init();
#endif
    return __builtin_cpu_supports("avx");
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
    __builtin_cpu_init();
    return __builtin_cpu_supports("avx");
#else
    // TODO: other x86 compilers checking avx here
    NCNN_LOGE("AVX detection method is unknown for current compiler");
    return 0;
#endif
#else
    return 0;
#endif
}
523
// Returns nonzero when the runtime cpu supports the MIPS SIMD Architecture (MSA).
int cpu_support_mips_msa()
{
#if defined __ANDROID__ || defined __linux__
#if __mips__
    return g_hwcaps & HWCAP_MIPS_MSA;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
536
// Returns nonzero when the runtime cpu supports the Loongson MMI extension.
int cpu_support_loongson_mmi()
{
#if defined __ANDROID__ || defined __linux__
#if __mips__
    return g_hwcaps & HWCAP_LOONGSON_MMI;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
549
// Returns nonzero when the runtime cpu supports the riscv V (vector) extension.
int cpu_support_riscv_v()
{
#if defined __ANDROID__ || defined __linux__
#if __riscv
    return g_hwcaps & COMPAT_HWCAP_ISA_V;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
562
// Returns nonzero when the runtime cpu is assumed to support the riscv zfh
// (half-precision float) extension.
int cpu_support_riscv_zfh()
{
#if defined __ANDROID__ || defined __linux__
#if __riscv
    // v + f does not imply zfh, but how to discover zfh properly ?
    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
    // note: '&' binds tighter than '&&', so this is (hwcaps & V) && (hwcaps & F)
    return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
577
// Returns the riscv vector register length in bytes (the vlenb CSR),
// or 0 when the V extension is unavailable.
int cpu_riscv_vlenb()
{
#if __riscv
    if (!cpu_support_riscv_v())
        return 0;

    int a = 0;
    // raw encoding is used so this assembles even without -march=...v support
    asm volatile(
        ".word 0xc22026f3 \n" // csrr a3, vlenb
        "mv %0, a3 \n"
        : "=r"(a)
        :
        : "memory", "a3");
    return a;
#else
    return 0;
#endif
}
596
// Detect the number of logical cpus using the best facility available
// per platform; always returns at least 1.
static int get_cpucount()
{
    int count = 0;
#ifdef __EMSCRIPTEN__
    if (emscripten_has_threading_support())
        count = emscripten_num_logical_cores();
    else
        count = 1;
#elif defined __ANDROID__ || defined __linux__
    // get cpu count from /proc/cpuinfo
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp)
        return 1;

    char line[1024];
    while (!feof(fp))
    {
        char* s = fgets(line, 1024, fp);
        if (!s)
            break;

        // one "processor : N" record per logical cpu
        if (memcmp(line, "processor", 9) == 0)
        {
            count++;
        }
    }

    fclose(fp);
#elif __APPLE__
    size_t len = sizeof(count);
    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
#else
#ifdef _OPENMP
    count = omp_get_max_threads();
#else
    count = 1;
#endif // _OPENMP
#endif

    // clamp: never report fewer than one cpu
    if (count < 1)
        count = 1;

    return count;
}
641
// cached once during static initialization
static int g_cpucount = get_cpucount();

// Returns the number of logical cpus detected at startup.
int get_cpu_count()
{
    return g_cpucount;
}
648
get_little_cpu_count()649 int get_little_cpu_count()
650 {
651 return get_cpu_thread_affinity_mask(1).num_enabled();
652 }
653
get_big_cpu_count()654 int get_big_cpu_count()
655 {
656 int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
657 return big_cpu_count ? big_cpu_count : g_cpucount;
658 }
659
660 #if defined __ANDROID__ || defined __linux__
get_max_freq_khz(int cpuid)661 static int get_max_freq_khz(int cpuid)
662 {
663 // first try, for all possible cpu
664 char path[256];
665 sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
666
667 FILE* fp = fopen(path, "rb");
668
669 if (!fp)
670 {
671 // second try, for online cpu
672 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
673 fp = fopen(path, "rb");
674
675 if (fp)
676 {
677 int max_freq_khz = 0;
678 while (!feof(fp))
679 {
680 int freq_khz = 0;
681 int nscan = fscanf(fp, "%d %*d", &freq_khz);
682 if (nscan != 1)
683 break;
684
685 if (freq_khz > max_freq_khz)
686 max_freq_khz = freq_khz;
687 }
688
689 fclose(fp);
690
691 if (max_freq_khz != 0)
692 return max_freq_khz;
693
694 fp = NULL;
695 }
696
697 if (!fp)
698 {
699 // third try, for online cpu
700 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
701 fp = fopen(path, "rb");
702
703 if (!fp)
704 return -1;
705
706 int max_freq_khz = -1;
707 int nscan = fscanf(fp, "%d", &max_freq_khz);
708 if (nscan != 1)
709 {
710 NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
711 }
712 fclose(fp);
713
714 return max_freq_khz;
715 }
716 }
717
718 int max_freq_khz = 0;
719 while (!feof(fp))
720 {
721 int freq_khz = 0;
722 int nscan = fscanf(fp, "%d %*d", &freq_khz);
723 if (nscan != 1)
724 break;
725
726 if (freq_khz > max_freq_khz)
727 max_freq_khz = freq_khz;
728 }
729
730 fclose(fp);
731
732 return max_freq_khz;
733 }
734
// Bind the calling thread to the cpus enabled in thread_affinity_mask.
// Returns 0 on success, -1 on syscall failure.
static int set_sched_affinity(const CpuSet& thread_affinity_mask)
{
    // set affinity for thread
#if defined(__BIONIC__)
    pid_t pid = gettid();
#else
    // glibc has no gettid() wrapper (before 2.30); fetch the kernel thread id directly
    pid_t pid = syscall(SYS_gettid);
#endif

    // raw syscall: the libc sched_setaffinity wrapper signature differs
    // across libc implementations
    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
    if (syscallret)
    {
        NCNN_LOGE("syscall error %d", syscallret);
        return -1;
    }

    return 0;
}
753 #endif // defined __ANDROID__ || defined __linux__
754
755 #if __APPLE__
// Suggest a cpu binding for the calling thread via mach affinity tags.
// Returns 0 on success (or when the kernel reports affinity unsupported),
// -1 on any other error.
static int set_sched_affinity(const CpuSet& thread_affinity_mask)
{
    // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
    // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
    // https://gist.github.com/Coneko/4234842

    // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
    // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio

    // derive an affinity tag from the first enabled cpu index (+1 so that
    // cpu 0 does not map to THREAD_AFFINITY_TAG_NULL)
    int affinity_tag = THREAD_AFFINITY_TAG_NULL;
    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
    {
        if (thread_affinity_mask.is_enabled(i))
        {
            affinity_tag = i + 1;
            break;
        }
    }

    mach_port_t tid = pthread_mach_thread_np(pthread_self());

    thread_affinity_policy_data_t policy_data;
    policy_data.affinity_tag = affinity_tag;
    // KERN_NOT_SUPPORTED is tolerated: modern macOS ignores affinity hints
    int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
    if (ret && ret != KERN_NOT_SUPPORTED)
    {
        NCNN_LOGE("thread_policy_set error %d", ret);
        return -1;
    }

    return 0;
}
789 #endif // __APPLE__
790
// last powersave mode applied by set_cpu_powersave(); 0 = all cores (default)
static int g_powersave = 0;

// Returns the current powersave mode (0 = all, 1 = little, 2 = big cores).
int get_cpu_powersave()
{
    return g_powersave;
}
797
set_cpu_powersave(int powersave)798 int set_cpu_powersave(int powersave)
799 {
800 if (powersave < 0 || powersave > 2)
801 {
802 NCNN_LOGE("powersave %d not supported", powersave);
803 return -1;
804 }
805
806 const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
807
808 int ret = set_cpu_thread_affinity(thread_affinity_mask);
809 if (ret != 0)
810 return ret;
811
812 g_powersave = powersave;
813
814 return 0;
815 }
816
817 static CpuSet g_thread_affinity_mask_all;
818 static CpuSet g_thread_affinity_mask_little;
819 static CpuSet g_thread_affinity_mask_big;
820
setup_thread_affinity_masks()821 static int setup_thread_affinity_masks()
822 {
823 g_thread_affinity_mask_all.disable_all();
824
825 #if defined __ANDROID__ || defined __linux__
826 int max_freq_khz_min = INT_MAX;
827 int max_freq_khz_max = 0;
828 std::vector<int> cpu_max_freq_khz(g_cpucount);
829 for (int i = 0; i < g_cpucount; i++)
830 {
831 int max_freq_khz = get_max_freq_khz(i);
832
833 // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
834
835 cpu_max_freq_khz[i] = max_freq_khz;
836
837 if (max_freq_khz > max_freq_khz_max)
838 max_freq_khz_max = max_freq_khz;
839 if (max_freq_khz < max_freq_khz_min)
840 max_freq_khz_min = max_freq_khz;
841 }
842
843 int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
844 if (max_freq_khz_medium == max_freq_khz_max)
845 {
846 g_thread_affinity_mask_little.disable_all();
847 g_thread_affinity_mask_big = g_thread_affinity_mask_all;
848 return 0;
849 }
850
851 for (int i = 0; i < g_cpucount; i++)
852 {
853 if (cpu_max_freq_khz[i] < max_freq_khz_medium)
854 g_thread_affinity_mask_little.enable(i);
855 else
856 g_thread_affinity_mask_big.enable(i);
857 }
858 #elif __APPLE__
859 // affinity info from cpu model
860 if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
861 {
862 // 2 + 4
863 g_thread_affinity_mask_big.enable(0);
864 g_thread_affinity_mask_big.enable(1);
865 g_thread_affinity_mask_little.enable(2);
866 g_thread_affinity_mask_little.enable(3);
867 g_thread_affinity_mask_little.enable(4);
868 g_thread_affinity_mask_little.enable(5);
869 }
870 else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD)
871 {
872 // 2 + 4 or 4 + 4
873 if (get_cpu_count() == 6)
874 {
875 g_thread_affinity_mask_big.enable(0);
876 g_thread_affinity_mask_big.enable(1);
877 g_thread_affinity_mask_little.enable(2);
878 g_thread_affinity_mask_little.enable(3);
879 g_thread_affinity_mask_little.enable(4);
880 g_thread_affinity_mask_little.enable(5);
881 }
882 else
883 {
884 g_thread_affinity_mask_big.enable(0);
885 g_thread_affinity_mask_big.enable(1);
886 g_thread_affinity_mask_big.enable(2);
887 g_thread_affinity_mask_big.enable(3);
888 g_thread_affinity_mask_little.enable(4);
889 g_thread_affinity_mask_little.enable(5);
890 g_thread_affinity_mask_little.enable(6);
891 g_thread_affinity_mask_little.enable(7);
892 }
893 }
894 else
895 {
896 // smp models
897 g_thread_affinity_mask_little.disable_all();
898 g_thread_affinity_mask_big = g_thread_affinity_mask_all;
899 }
900 #else
901 // TODO implement me for other platforms
902 g_thread_affinity_mask_little.disable_all();
903 g_thread_affinity_mask_big = g_thread_affinity_mask_all;
904 #endif
905
906 return 0;
907 }
908
get_cpu_thread_affinity_mask(int powersave)909 const CpuSet& get_cpu_thread_affinity_mask(int powersave)
910 {
911 setup_thread_affinity_masks();
912
913 if (powersave == 0)
914 return g_thread_affinity_mask_all;
915
916 if (powersave == 1)
917 return g_thread_affinity_mask_little;
918
919 if (powersave == 2)
920 return g_thread_affinity_mask_big;
921
922 NCNN_LOGE("powersave %d not supported", powersave);
923
924 // fallback to all cores anyway
925 return g_thread_affinity_mask_all;
926 }
927
// Bind the OpenMP worker threads (or just the calling thread in serial
// builds) to the cpus enabled in thread_affinity_mask.
// Returns 0 on success, -1 if any thread could not be bound.
int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
{
#if defined __ANDROID__ || defined __linux__
    int num_threads = thread_affinity_mask.num_enabled();

#ifdef _OPENMP
    // set affinity for each thread
    // spawning a parallel region lets every worker bind itself
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        ssarets[i] = set_sched_affinity(thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#elif __APPLE__

#ifdef _OPENMP
    int num_threads = thread_affinity_mask.num_enabled();

    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        // assign one core for each thread
        // walk the mask counting up from -1-i: thread i ends up on the
        // i-th enabled cpu, or core stays at -1-i if fewer are enabled
        int core = -1 - i;
        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
        {
            if (thread_affinity_mask.is_enabled(j))
            {
                if (core == -1)
                {
                    core = j;
                    break;
                }
                else
                {
                    core++;
                }
            }
        }
        CpuSet this_thread_affinity_mask;
        if (core != -1 - i)
        {
            this_thread_affinity_mask.enable(core);
        }

        ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#else
    // TODO
    (void)thread_affinity_mask;
    return -1;
#endif
}
1008
// Number of threads in the current OpenMP team, or 1 in serial builds.
int get_omp_num_threads()
{
#ifdef _OPENMP
    const int nthreads = omp_get_num_threads();
    return nthreads;
#else
    // built without OpenMP: always exactly one thread
    return 1;
#endif
}
1017
// Set the OpenMP team size for subsequent parallel regions; no-op in
// serial builds.
void set_omp_num_threads(int num_threads)
{
#ifndef _OPENMP
    (void)num_threads; // nothing to configure without OpenMP
#else
    omp_set_num_threads(num_threads);
#endif
}
1026
// Returns the OpenMP dynamic-adjustment flag, or 0 in serial builds.
int get_omp_dynamic()
{
#ifdef _OPENMP
    const int dynamic_enabled = omp_get_dynamic();
    return dynamic_enabled;
#else
    return 0;
#endif
}
1035
// Enable/disable OpenMP dynamic thread-count adjustment; no-op in serial builds.
void set_omp_dynamic(int dynamic)
{
#ifndef _OPENMP
    (void)dynamic; // nothing to configure without OpenMP
#else
    omp_set_dynamic(dynamic);
#endif
}
1044
// Returns the calling thread's index within the OpenMP team, or 0 in
// serial builds.
int get_omp_thread_num()
{
#ifdef _OPENMP
    const int thread_id = omp_get_thread_num();
    return thread_id;
#else
    return 0;
#endif
}
1053
// Returns the libomp worker block time in ms; only meaningful when built
// with clang's OpenMP runtime, otherwise 0.
int get_kmp_blocktime()
{
#if defined(_OPENMP) && __clang__
    const int blocktime_ms = kmp_get_blocktime();
    return blocktime_ms;
#else
    return 0;
#endif
}
1062
// Set the libomp worker block time in ms; no-op unless built with clang's
// OpenMP runtime, which is the only one exposing kmp_set_blocktime.
void set_kmp_blocktime(int time_ms)
{
#if !(defined(_OPENMP) && __clang__)
    (void)time_ms; // runtime does not expose a blocktime knob
#else
    kmp_set_blocktime(time_ms);
#endif
}
1071
// per-thread record of the last mode passed to set_flush_denormals()
static ncnn::ThreadLocalStorage tls_flush_denormals;

// Returns the calling thread's flush-denormals mode (0..3);
// always 0 on targets built without SSE3.
int get_flush_denormals()
{
#if defined(__SSE3__)
    // the mode is stashed as a pointer-sized integer in thread-local storage
    return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
#else
    return 0;
#endif
}
1082
set_flush_denormals(int flush_denormals)1083 int set_flush_denormals(int flush_denormals)
1084 {
1085 if (flush_denormals < 0 || flush_denormals > 3)
1086 {
1087 NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
1088 return -1;
1089 }
1090 #if defined(__SSE3__)
1091 if (flush_denormals == 0)
1092 {
1093 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
1094 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
1095 }
1096 else if (flush_denormals == 1)
1097 {
1098 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
1099 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
1100 }
1101 else if (flush_denormals == 2)
1102 {
1103 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
1104 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
1105 }
1106 else if (flush_denormals == 3)
1107 {
1108 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
1109 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
1110 }
1111
1112 tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
1113 return 0;
1114 #else
1115 return 0;
1116 #endif
1117 }
1118
1119 } // namespace ncnn
1120