1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "cpu.h"
16 
17 #include "platform.h"
18 
19 #include <limits.h>
20 #include <stdio.h>
21 #include <string.h>
22 
23 #ifdef _OPENMP
24 #if NCNN_SIMPLEOMP
25 #include "simpleomp.h"
26 #else
27 #include <omp.h>
28 #endif
29 #endif
30 
31 #ifdef _MSC_VER
32 #include <intrin.h>    // __cpuid()
33 #include <immintrin.h> // _xgetbv()
34 #endif
35 
36 #ifdef __EMSCRIPTEN__
37 #include <emscripten/threading.h>
38 #endif
39 
40 #if defined __ANDROID__ || defined __linux__
41 #if defined __ANDROID__
42 #include <dlfcn.h>
43 #endif
44 #include <stdint.h>
45 #include <sys/syscall.h>
46 #include <unistd.h>
47 #endif
48 
49 #if __APPLE__
50 #include <mach/mach.h>
51 #include <mach/machine.h>
52 #include <mach/thread_act.h>
53 #include <sys/sysctl.h>
54 #include <sys/types.h>
55 #include "TargetConditionals.h"
56 #if TARGET_OS_IPHONE
57 #define __IOS__ 1
58 #endif
59 // define missing cpu model for old sdk
60 #ifndef CPUFAMILY_ARM_HURRICANE
61 #define CPUFAMILY_ARM_HURRICANE 0x67ceee93
62 #endif
63 // A11
64 #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
65 #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
66 #endif
67 // A12
68 #ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
69 #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
70 #endif
71 // A13
72 #ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
73 #define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
74 #endif
75 // A14
76 #ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
77 #define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
78 #endif
79 // A15
80 #ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD
81 #define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d
82 #endif
83 // M1
84 #ifndef CPUFAMILY_AARCH64_FIRESTORM_ICESTORM
85 #define CPUFAMILY_AARCH64_FIRESTORM_ICESTORM 0x1b588bb3
86 #endif
87 #endif // __APPLE__
88 
89 #if defined(__SSE3__)
90 #include <immintrin.h>
91 #endif
92 
93 namespace ncnn {
94 
95 #if defined __ANDROID__ || defined __linux__
96 
97 #define AT_HWCAP  16
98 #define AT_HWCAP2 26
99 
100 #if defined __ANDROID__
101 // Probe the system's C library for a 'getauxval' function and call it if
102 // it exits, or return 0 for failure. This function is available since API
103 // level 20.
104 //
105 // This code does *NOT* check for '__ANDROID_API__ >= 20' to support the
106 // edge case where some NDK developers use headers for a platform that is
107 // newer than the one really targetted by their application.
108 // This is typically done to use newer native APIs only when running on more
109 // recent Android versions, and requires careful symbol management.
110 //
111 // Note that getauxval() can't really be re-implemented here, because
112 // its implementation does not parse /proc/self/auxv. Instead it depends
113 // on values  that are passed by the kernel at process-init time to the
114 // C runtime initialization layer.
get_elf_hwcap_from_getauxval()115 static unsigned int get_elf_hwcap_from_getauxval()
116 {
117     typedef unsigned long getauxval_func_t(unsigned long);
118 
119     dlerror();
120     void* libc_handle = dlopen("libc.so", RTLD_NOW);
121     if (!libc_handle)
122     {
123         NCNN_LOGE("dlopen libc.so failed %s", dlerror());
124         return 0;
125     }
126 
127     unsigned int result = 0;
128     getauxval_func_t* func = (getauxval_func_t*)dlsym(libc_handle, "getauxval");
129     if (!func)
130     {
131         NCNN_LOGE("dlsym getauxval failed");
132     }
133     else
134     {
135         // Note: getauxval() returns 0 on failure. Doesn't touch errno.
136         result = (unsigned int)(*func)(AT_HWCAP);
137     }
138     dlclose(libc_handle);
139 
140     return result;
141 }
142 #endif // defined __ANDROID__
143 
144 // extract the ELF HW capabilities bitmap from /proc/self/auxv
get_elf_hwcap_from_proc_self_auxv()145 static unsigned int get_elf_hwcap_from_proc_self_auxv()
146 {
147     FILE* fp = fopen("/proc/self/auxv", "rb");
148     if (!fp)
149     {
150         NCNN_LOGE("fopen /proc/self/auxv failed");
151         return 0;
152     }
153 
154 #if __aarch64__ || __riscv_xlen == 64
155     struct
156     {
157         uint64_t tag;
158         uint64_t value;
159     } entry;
160 #else
161     struct
162     {
163         unsigned int tag;
164         unsigned int value;
165     } entry;
166 
167 #endif
168 
169     unsigned int result = 0;
170     while (!feof(fp))
171     {
172         int nread = fread((char*)&entry, sizeof(entry), 1, fp);
173         if (nread != 1)
174             break;
175 
176         if (entry.tag == 0 && entry.value == 0)
177             break;
178 
179         if (entry.tag == AT_HWCAP)
180         {
181             result = entry.value;
182             break;
183         }
184     }
185 
186     fclose(fp);
187 
188     return result;
189 }
190 
// Resolve the ELF hwcap bitmap: on Android prefer the libc getauxval()
// probe, otherwise (or when it fails) fall back to parsing /proc/self/auxv.
static unsigned int get_elf_hwcap()
{
#if defined __ANDROID__
    unsigned int hwcap = get_elf_hwcap_from_getauxval();
    if (hwcap)
        return hwcap;
#endif

    return get_elf_hwcap_from_proc_self_auxv();
}

// cached AT_HWCAP bits, resolved once during static initialization
static unsigned int g_hwcaps = get_elf_hwcap();
203 
204 #if __aarch64__
205 // from arch/arm64/include/uapi/asm/hwcap.h
206 #define HWCAP_ASIMD   (1 << 1)
207 #define HWCAP_ASIMDHP (1 << 10)
208 #define HWCAP_ASIMDDP (1 << 20)
209 #else
210 // from arch/arm/include/uapi/asm/hwcap.h
211 #define HWCAP_NEON  (1 << 12)
212 #define HWCAP_VFPv4 (1 << 16)
213 #endif
214 
215 #if __mips__
216 // from arch/mips/include/uapi/asm/hwcap.h
217 #define HWCAP_MIPS_MSA     (1 << 1)
218 #define HWCAP_LOONGSON_MMI (1 << 11)
219 #endif
220 
221 #if __riscv
222 // from arch/riscv/include/uapi/asm/hwcap.h
223 #define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
224 #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
225 #endif
226 
227 #endif // defined __ANDROID__ || defined __linux__
228 
229 #if __APPLE__
// query the cpu family id via sysctl (left at 0 if sysctl fails)
static unsigned int get_hw_cpufamily()
{
    unsigned int value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
    return value;
}

// query the mach cpu type via sysctl (left at 0 if sysctl fails)
static cpu_type_t get_hw_cputype()
{
    cpu_type_t value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cputype", &value, &len, NULL, 0);
    return value;
}

// query the mach cpu subtype via sysctl (left at 0 if sysctl fails)
static cpu_subtype_t get_hw_cpusubtype()
{
    cpu_subtype_t value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
    return value;
}

// cached hardware identity, resolved once during static initialization
static unsigned int g_hw_cpufamily = get_hw_cpufamily();
static cpu_type_t g_hw_cputype = get_hw_cputype();
static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
257 #endif // __APPLE__
258 
259 #if defined __ANDROID__ || defined __linux__
CpuSet()260 CpuSet::CpuSet()
261 {
262     disable_all();
263 }
264 
enable(int cpu)265 void CpuSet::enable(int cpu)
266 {
267     CPU_SET(cpu, &cpu_set);
268 }
269 
disable(int cpu)270 void CpuSet::disable(int cpu)
271 {
272     CPU_CLR(cpu, &cpu_set);
273 }
274 
disable_all()275 void CpuSet::disable_all()
276 {
277     CPU_ZERO(&cpu_set);
278 }
279 
is_enabled(int cpu) const280 bool CpuSet::is_enabled(int cpu) const
281 {
282     return CPU_ISSET(cpu, &cpu_set);
283 }
284 
num_enabled() const285 int CpuSet::num_enabled() const
286 {
287     int num_enabled = 0;
288     for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
289     {
290         if (is_enabled(i))
291             num_enabled++;
292     }
293 
294     return num_enabled;
295 }
296 #elif __APPLE__
CpuSet()297 CpuSet::CpuSet()
298 {
299     disable_all();
300 }
301 
enable(int cpu)302 void CpuSet::enable(int cpu)
303 {
304     policy |= (1 << cpu);
305 }
306 
disable(int cpu)307 void CpuSet::disable(int cpu)
308 {
309     policy &= ~(1 << cpu);
310 }
311 
disable_all()312 void CpuSet::disable_all()
313 {
314     policy = 0;
315 }
316 
is_enabled(int cpu) const317 bool CpuSet::is_enabled(int cpu) const
318 {
319     return policy & (1 << cpu);
320 }
321 
num_enabled() const322 int CpuSet::num_enabled() const
323 {
324     int num_enabled = 0;
325     for (int i = 0; i < (int)sizeof(policy) * 8; i++)
326     {
327         if (is_enabled(i))
328             num_enabled++;
329     }
330 
331     return num_enabled;
332 }
333 #else
// fallback implementation for platforms without thread affinity support:
// the set carries no state and reports every cpu as enabled
CpuSet::CpuSet()
{
}

void CpuSet::enable(int /* cpu */)
{
}

void CpuSet::disable(int /* cpu */)
{
}

void CpuSet::disable_all()
{
}

// every cpu is considered a member
bool CpuSet::is_enabled(int /* cpu */) const
{
    return true;
}

// report the full logical cpu count
int CpuSet::num_enabled() const
{
    return get_cpu_count();
}
359 #endif
360 
// return non-zero when the runtime cpu supports arm neon
// (advanced simd on aarch64)
int cpu_support_arm_neon()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_NEON;
#endif
#elif __APPLE__
#if __aarch64__
    // arm64 apple hardware is assumed to always carry asimd
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    // 32-bit: subtypes newer than plain armv7 are assumed to have neon
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
    return 0;
#endif
}
379 
// return non-zero when the runtime cpu supports vfpv4 (fma, fp16 conversion)
int cpu_support_arm_vfpv4()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    // neon always enable fma and fp16
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __APPLE__
#if __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    // 32-bit: subtypes newer than armv7s are assumed to have vfpv4
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
    return 0;
#endif
}
399 
// return non-zero when the runtime cpu supports fp16 simd arithmetic (asimdhp)
int cpu_support_arm_asimdhp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDHP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // families from A11 (monsoon/mistral) onward, per the defines above
    return g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
418 
// return non-zero when the runtime cpu supports the simd dot product
// extension (asimddp)
int cpu_support_arm_asimddp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDDP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // families from A13 (lightning/thunder) onward, per the defines above
    return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
437 
// return non-zero when the runtime cpu supports avx2 AND the build enables
// it (NCNN_AVX2); also verifies the OS saves the YMM state
int cpu_support_x86_avx2()
{
#if !NCNN_AVX2
    return 0;
#endif
#if (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
#if defined(_MSC_VER)
    // TODO move to init function
    int regs[4];
    __cpuid(regs, 0);

    const int max_leaf = regs[0];
    if (max_leaf < 7)
        return 0;

    // leaf 1 ECX: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX — all required
    __cpuid(regs, 1);
    const int required = 0x10000000 | 0x04000000 | 0x08000000;
    if ((regs[2] & required) != required)
        return 0;

    // XCR0 bits 1|2: OS preserves XMM and YMM registers across context switches
    if ((_xgetbv(0) & 6) != 6)
        return 0;

    // leaf 7 EBX bit 5 = AVX2
    __cpuid(regs, 7);
    return regs[1] & 0x00000020;
#elif defined(__clang__)
#if __clang_major__ >= 6
    __builtin_cpu_init();
#endif
    return __builtin_cpu_supports("avx2");
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
    __builtin_cpu_init();
    return __builtin_cpu_supports("avx2");
#else
    // TODO: other x86 compilers checking avx2 here
    NCNN_LOGE("AVX2 detection method is unknown for current compiler");
    return 0;
#endif
#else
    return 0;
#endif
}
481 
// return non-zero when the runtime cpu supports avx AND the build enables
// it (NCNN_AVX); also verifies the OS saves the YMM state
int cpu_support_x86_avx()
{
#if !NCNN_AVX
    return 0;
#endif
#if (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
#if defined(_MSC_VER)
    // TODO move to init function
    int regs[4];
    __cpuid(regs, 0);

    const int max_leaf = regs[0];
    if (max_leaf < 7)
        return 0;

    // leaf 1 ECX: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX — all required
    __cpuid(regs, 1);
    const int required = 0x10000000 | 0x04000000 | 0x08000000;
    if ((regs[2] & required) != required)
        return 0;

    // XCR0 bits 1|2: OS preserves XMM and YMM registers across context switches
    if ((_xgetbv(0) & 6) != 6)
        return 0;

    return 1;
#elif defined(__clang__)
#if __clang_major__ >= 6
    __builtin_cpu_init();
#endif
    return __builtin_cpu_supports("avx");
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
    __builtin_cpu_init();
    return __builtin_cpu_supports("avx");
#else
    // TODO: other x86 compilers checking avx here
    NCNN_LOGE("AVX detection method is unknown for current compiler");
    return 0;
#endif
#else
    return 0;
#endif
}
523 
// return non-zero when the runtime cpu supports the mips msa simd extension
int cpu_support_mips_msa()
{
#if defined __ANDROID__ || defined __linux__
#if __mips__
    return g_hwcaps & HWCAP_MIPS_MSA;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
536 
// return non-zero when the runtime cpu supports the loongson mmi extension
int cpu_support_loongson_mmi()
{
#if defined __ANDROID__ || defined __linux__
#if __mips__
    return g_hwcaps & HWCAP_LOONGSON_MMI;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
549 
// return non-zero when the runtime cpu supports the riscv vector (v) extension
int cpu_support_riscv_v()
{
#if defined __ANDROID__ || defined __linux__
#if __riscv
    return g_hwcaps & COMPAT_HWCAP_ISA_V;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
562 
// return non-zero when riscv half-precision float (zfh) support is assumed,
// approximated by the presence of both the v and f extensions
int cpu_support_riscv_zfh()
{
#if defined __ANDROID__ || defined __linux__
#if __riscv
    // v + f does not imply zfh, but how to discover zfh properly ?
    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
    // note: this parses as (hwcaps & V) && (hwcaps & F) since && binds looser than &
    return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
577 
// return the riscv vector register length in bytes (the vlenb csr),
// or 0 when the v extension is unavailable
int cpu_riscv_vlenb()
{
#if __riscv
    if (!cpu_support_riscv_v())
        return 0;

    int a = 0;
    asm volatile(
        ".word  0xc22026f3  \n" // csrr  a3, vlenb  (raw encoding, presumably so non-v assemblers accept it)
        "mv     %0, a3      \n"
        : "=r"(a)
        :
        : "memory", "a3");
    return a;
#else
    return 0;
#endif
}
596 
// detect the number of logical cpus for the current platform,
// clamped to at least 1
static int get_cpucount()
{
    int count = 0;
#ifdef __EMSCRIPTEN__
    if (emscripten_has_threading_support())
        count = emscripten_num_logical_cores();
    else
        count = 1;
#elif defined __ANDROID__ || defined __linux__
    // count the "processor" entries in /proc/cpuinfo
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp)
        return 1;

    char line[1024];
    while (fgets(line, 1024, fp))
    {
        if (strncmp(line, "processor", 9) == 0)
            count++;
    }

    fclose(fp);
#elif __APPLE__
    size_t len = sizeof(count);
    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
#else
#ifdef _OPENMP
    count = omp_get_max_threads();
#else
    count = 1;
#endif // _OPENMP
#endif

    // never report less than one cpu
    return count > 0 ? count : 1;
}

// cached logical cpu count, resolved once during static initialization
static int g_cpucount = get_cpucount();
643 
// number of logical cpus detected at startup
int get_cpu_count()
{
    return g_cpucount;
}

// number of cpus in the little (low-frequency) cluster
int get_little_cpu_count()
{
    return get_cpu_thread_affinity_mask(1).num_enabled();
}

// number of cpus in the big (high-frequency) cluster,
// falling back to the full cpu count when no big cluster is known
int get_big_cpu_count()
{
    int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
    return big_cpu_count ? big_cpu_count : g_cpucount;
}
659 
660 #if defined __ANDROID__ || defined __linux__
get_max_freq_khz(int cpuid)661 static int get_max_freq_khz(int cpuid)
662 {
663     // first try, for all possible cpu
664     char path[256];
665     sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
666 
667     FILE* fp = fopen(path, "rb");
668 
669     if (!fp)
670     {
671         // second try, for online cpu
672         sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
673         fp = fopen(path, "rb");
674 
675         if (fp)
676         {
677             int max_freq_khz = 0;
678             while (!feof(fp))
679             {
680                 int freq_khz = 0;
681                 int nscan = fscanf(fp, "%d %*d", &freq_khz);
682                 if (nscan != 1)
683                     break;
684 
685                 if (freq_khz > max_freq_khz)
686                     max_freq_khz = freq_khz;
687             }
688 
689             fclose(fp);
690 
691             if (max_freq_khz != 0)
692                 return max_freq_khz;
693 
694             fp = NULL;
695         }
696 
697         if (!fp)
698         {
699             // third try, for online cpu
700             sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
701             fp = fopen(path, "rb");
702 
703             if (!fp)
704                 return -1;
705 
706             int max_freq_khz = -1;
707             int nscan = fscanf(fp, "%d", &max_freq_khz);
708             if (nscan != 1)
709             {
710                 NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
711             }
712             fclose(fp);
713 
714             return max_freq_khz;
715         }
716     }
717 
718     int max_freq_khz = 0;
719     while (!feof(fp))
720     {
721         int freq_khz = 0;
722         int nscan = fscanf(fp, "%d %*d", &freq_khz);
723         if (nscan != 1)
724             break;
725 
726         if (freq_khz > max_freq_khz)
727             max_freq_khz = freq_khz;
728     }
729 
730     fclose(fp);
731 
732     return max_freq_khz;
733 }
734 
// bind the calling thread to the cpus in thread_affinity_mask via
// sched_setaffinity; returns 0 on success, -1 on syscall failure
static int set_sched_affinity(const CpuSet& thread_affinity_mask)
{
    // set affinity for thread
#if defined(__BIONIC__)
    pid_t pid = gettid();
#else
    // no gettid() wrapper here; use the raw syscall
    pid_t pid = syscall(SYS_gettid);
#endif

    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
    if (syscallret)
    {
        NCNN_LOGE("syscall error %d", syscallret);
        return -1;
    }

    return 0;
}
753 #endif // defined __ANDROID__ || defined __linux__
754 
755 #if __APPLE__
// best-effort affinity on macOS/iOS using thread affinity tags;
// modern macOS treats (or ignores) the tag as a scheduling hint only
static int set_sched_affinity(const CpuSet& thread_affinity_mask)
{
    // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
    // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
    // https://gist.github.com/Coneko/4234842

    // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
    // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919   --- AmeAkio

    // derive the tag from the first enabled cpu index (+1 keeps it non-null)
    int affinity_tag = THREAD_AFFINITY_TAG_NULL;
    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
    {
        if (thread_affinity_mask.is_enabled(i))
        {
            affinity_tag = i + 1;
            break;
        }
    }

    mach_port_t tid = pthread_mach_thread_np(pthread_self());

    thread_affinity_policy_data_t policy_data;
    policy_data.affinity_tag = affinity_tag;
    int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
    // KERN_NOT_SUPPORTED is tolerated since newer systems reject affinity policies
    if (ret && ret != KERN_NOT_SUPPORTED)
    {
        NCNN_LOGE("thread_policy_set error %d", ret);
        return -1;
    }

    return 0;
}
789 #endif // __APPLE__
790 
// last powersave mode applied via set_cpu_powersave()
// (0 = all cores, 1 = little cores only, 2 = big cores only)
static int g_powersave = 0;

// return the powersave mode last applied by set_cpu_powersave()
int get_cpu_powersave()
{
    return g_powersave;
}

// bind worker threads to the cpu cluster selected by powersave
// (0 = all cores, 1 = little cores only, 2 = big cores only)
// returns 0 on success, -1 on invalid argument or affinity failure
int set_cpu_powersave(int powersave)
{
    if (powersave < 0 || powersave > 2)
    {
        NCNN_LOGE("powersave %d not supported", powersave);
        return -1;
    }

    const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);

    int ret = set_cpu_thread_affinity(thread_affinity_mask);
    if (ret != 0)
        return ret;

    // record the mode only after affinity was applied successfully
    g_powersave = powersave;

    return 0;
}
816 
817 static CpuSet g_thread_affinity_mask_all;
818 static CpuSet g_thread_affinity_mask_little;
819 static CpuSet g_thread_affinity_mask_big;
820 
setup_thread_affinity_masks()821 static int setup_thread_affinity_masks()
822 {
823     g_thread_affinity_mask_all.disable_all();
824 
825 #if defined __ANDROID__ || defined __linux__
826     int max_freq_khz_min = INT_MAX;
827     int max_freq_khz_max = 0;
828     std::vector<int> cpu_max_freq_khz(g_cpucount);
829     for (int i = 0; i < g_cpucount; i++)
830     {
831         int max_freq_khz = get_max_freq_khz(i);
832 
833         //         NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
834 
835         cpu_max_freq_khz[i] = max_freq_khz;
836 
837         if (max_freq_khz > max_freq_khz_max)
838             max_freq_khz_max = max_freq_khz;
839         if (max_freq_khz < max_freq_khz_min)
840             max_freq_khz_min = max_freq_khz;
841     }
842 
843     int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
844     if (max_freq_khz_medium == max_freq_khz_max)
845     {
846         g_thread_affinity_mask_little.disable_all();
847         g_thread_affinity_mask_big = g_thread_affinity_mask_all;
848         return 0;
849     }
850 
851     for (int i = 0; i < g_cpucount; i++)
852     {
853         if (cpu_max_freq_khz[i] < max_freq_khz_medium)
854             g_thread_affinity_mask_little.enable(i);
855         else
856             g_thread_affinity_mask_big.enable(i);
857     }
858 #elif __APPLE__
859     // affinity info from cpu model
860     if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
861     {
862         // 2 + 4
863         g_thread_affinity_mask_big.enable(0);
864         g_thread_affinity_mask_big.enable(1);
865         g_thread_affinity_mask_little.enable(2);
866         g_thread_affinity_mask_little.enable(3);
867         g_thread_affinity_mask_little.enable(4);
868         g_thread_affinity_mask_little.enable(5);
869     }
870     else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD)
871     {
872         // 2 + 4 or 4 + 4
873         if (get_cpu_count() == 6)
874         {
875             g_thread_affinity_mask_big.enable(0);
876             g_thread_affinity_mask_big.enable(1);
877             g_thread_affinity_mask_little.enable(2);
878             g_thread_affinity_mask_little.enable(3);
879             g_thread_affinity_mask_little.enable(4);
880             g_thread_affinity_mask_little.enable(5);
881         }
882         else
883         {
884             g_thread_affinity_mask_big.enable(0);
885             g_thread_affinity_mask_big.enable(1);
886             g_thread_affinity_mask_big.enable(2);
887             g_thread_affinity_mask_big.enable(3);
888             g_thread_affinity_mask_little.enable(4);
889             g_thread_affinity_mask_little.enable(5);
890             g_thread_affinity_mask_little.enable(6);
891             g_thread_affinity_mask_little.enable(7);
892         }
893     }
894     else
895     {
896         // smp models
897         g_thread_affinity_mask_little.disable_all();
898         g_thread_affinity_mask_big = g_thread_affinity_mask_all;
899     }
900 #else
901     // TODO implement me for other platforms
902     g_thread_affinity_mask_little.disable_all();
903     g_thread_affinity_mask_big = g_thread_affinity_mask_all;
904 #endif
905 
906     return 0;
907 }
908 
get_cpu_thread_affinity_mask(int powersave)909 const CpuSet& get_cpu_thread_affinity_mask(int powersave)
910 {
911     setup_thread_affinity_masks();
912 
913     if (powersave == 0)
914         return g_thread_affinity_mask_all;
915 
916     if (powersave == 1)
917         return g_thread_affinity_mask_little;
918 
919     if (powersave == 2)
920         return g_thread_affinity_mask_big;
921 
922     NCNN_LOGE("powersave %d not supported", powersave);
923 
924     // fallback to all cores anyway
925     return g_thread_affinity_mask_all;
926 }
927 
// apply thread_affinity_mask to every openmp worker thread (or just the
// calling thread when built without openmp)
// returns 0 on success, -1 when any thread failed to bind (or on
// unsupported platforms)
int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
{
#if defined __ANDROID__ || defined __linux__
    int num_threads = thread_affinity_mask.num_enabled();

#ifdef _OPENMP
    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        ssarets[i] = set_sched_affinity(thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#elif __APPLE__

#ifdef _OPENMP
    int num_threads = thread_affinity_mask.num_enabled();

    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        // assign one core for each thread
        // core starts at -1 - i and counts up once per enabled bit, so it
        // reaches -1 after skipping i enabled cpus and then takes the next
        // one: thread i lands on the (i+1)-th enabled cpu
        int core = -1 - i;
        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
        {
            if (thread_affinity_mask.is_enabled(j))
            {
                if (core == -1)
                {
                    core = j;
                    break;
                }
                else
                {
                    core++;
                }
            }
        }
        CpuSet this_thread_affinity_mask;
        // leave the per-thread mask empty when no cpu was assigned
        if (core != -1 - i)
        {
            this_thread_affinity_mask.enable(core);
        }

        ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#else
    // TODO
    (void)thread_affinity_mask;
    return -1;
#endif
}
1008 
// number of threads in the current openmp parallel region (1 without openmp)
int get_omp_num_threads()
{
#ifdef _OPENMP
    return omp_get_num_threads();
#else
    return 1;
#endif
}

// set the openmp thread team size (no-op without openmp)
void set_omp_num_threads(int num_threads)
{
#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#else
    (void)num_threads;
#endif
}

// query openmp dynamic thread adjustment (0 without openmp)
int get_omp_dynamic()
{
#ifdef _OPENMP
    return omp_get_dynamic();
#else
    return 0;
#endif
}

// toggle openmp dynamic thread adjustment (no-op without openmp)
void set_omp_dynamic(int dynamic)
{
#ifdef _OPENMP
    omp_set_dynamic(dynamic);
#else
    (void)dynamic;
#endif
}

// id of the calling thread within the parallel region (0 without openmp)
int get_omp_thread_num()
{
#ifdef _OPENMP
    return omp_get_thread_num();
#else
    return 0;
#endif
}

// spin-wait time of libomp worker threads in ms (clang/libomp builds only)
int get_kmp_blocktime()
{
#if defined(_OPENMP) && __clang__
    return kmp_get_blocktime();
#else
    return 0;
#endif
}

// set the libomp worker spin-wait time in ms (clang/libomp builds only)
void set_kmp_blocktime(int time_ms)
{
#if defined(_OPENMP) && __clang__
    kmp_set_blocktime(time_ms);
#else
    (void)time_ms;
#endif
}
1071 
// per-thread record of the current flush-denormals mode, stored as a
// pointer-sized integer in thread-local storage
static ncnn::ThreadLocalStorage tls_flush_denormals;

// return the flush-denormals mode last set on this thread
// (always 0 when SSE3 is unavailable)
int get_flush_denormals()
{
#if defined(__SSE3__)
    return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
#else
    return 0;
#endif
}
1082 
set_flush_denormals(int flush_denormals)1083 int set_flush_denormals(int flush_denormals)
1084 {
1085     if (flush_denormals < 0 || flush_denormals > 3)
1086     {
1087         NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
1088         return -1;
1089     }
1090 #if defined(__SSE3__)
1091     if (flush_denormals == 0)
1092     {
1093         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
1094         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
1095     }
1096     else if (flush_denormals == 1)
1097     {
1098         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
1099         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
1100     }
1101     else if (flush_denormals == 2)
1102     {
1103         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
1104         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
1105     }
1106     else if (flush_denormals == 3)
1107     {
1108         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
1109         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
1110     }
1111 
1112     tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
1113     return 0;
1114 #else
1115     return 0;
1116 #endif
1117 }
1118 
1119 } // namespace ncnn
1120