1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "cpu.h"
16 
17 #include "platform.h"
18 
19 #include <limits.h>
20 #include <stdio.h>
21 #include <string.h>
22 
23 #ifdef _OPENMP
24 #if NCNN_SIMPLEOMP
25 #include "simpleomp.h"
26 #else
27 #include <omp.h>
28 #endif
29 #endif
30 
31 #ifdef _MSC_VER
32 #include <intrin.h>    // __cpuid()
33 #include <immintrin.h> // _xgetbv()
34 #endif
35 
36 #ifdef __EMSCRIPTEN__
37 #include <emscripten/threading.h>
38 #endif
39 
40 #if defined __ANDROID__ || defined __linux__
41 #include <stdint.h>
42 #include <sys/syscall.h>
43 #include <unistd.h>
44 #endif
45 
46 #if __APPLE__
47 #include <mach/mach.h>
48 #include <mach/machine.h>
49 #include <mach/thread_act.h>
50 #include <sys/sysctl.h>
51 #include <sys/types.h>
52 #include "TargetConditionals.h"
53 #if TARGET_OS_IPHONE
54 #define __IOS__ 1
55 #endif
56 // define missing cpu model for old sdk
57 #ifndef CPUFAMILY_ARM_HURRICANE
58 #define CPUFAMILY_ARM_HURRICANE 0x67ceee93
59 #endif
60 #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
61 #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
62 #endif
63 #ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
64 #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
65 #endif
66 #ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
67 #define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
68 #endif
69 #ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
70 #define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
71 #endif
72 #endif
73 
74 #if defined(__SSE3__)
75 #include <immintrin.h>
76 #endif
77 
78 namespace ncnn {
79 
80 #if defined __ANDROID__ || defined __linux__
81 
82 // extract the ELF HW capabilities bitmap from /proc/self/auxv
// extract the ELF HW capabilities bitmap from /proc/self/auxv
static unsigned int get_elf_hwcap_from_proc_self_auxv()
{
    FILE* fp = fopen("/proc/self/auxv", "rb");
    if (!fp)
    {
        return 0;
    }

#define AT_HWCAP  16
#define AT_HWCAP2 26
#if __aarch64__ || __riscv_xlen == 64
    // auxv records are pointer-sized (tag, value) pairs
    struct
    {
        uint64_t tag;
        uint64_t value;
    } aux;
#else
    struct
    {
        unsigned int tag;
        unsigned int value;
    } aux;

#endif

    unsigned int hwcap = 0;
    for (;;)
    {
        if (fread((char*)&aux, sizeof(aux), 1, fp) != 1)
            break;

        // the vector is terminated by an AT_NULL (0, 0) record
        if (aux.tag == 0 && aux.value == 0)
            break;

        if (aux.tag == AT_HWCAP)
        {
            hwcap = aux.value;
            break;
        }
    }

    fclose(fp);

    return hwcap;
}
129 
// hwcap bitmap read once at startup; kernel hotpatching aside, it never changes
static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();

#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD   (1 << 1)  // advanced simd (neon)
#define HWCAP_ASIMDHP (1 << 10) // half-precision asimd arithmetic
#define HWCAP_ASIMDDP (1 << 20) // dot-product instructions
#else
// from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_NEON  (1 << 12)
#define HWCAP_VFPv4 (1 << 16)
#endif

#if __riscv
// from arch/riscv/include/uapi/asm/hwcap.h
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif
149 
150 #if __APPLE__
// query "hw.cpufamily" via sysctl (apple cpu microarchitecture family id);
// returns 0 if the query fails
static unsigned int get_hw_cpufamily()
{
    unsigned int value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
    return value;
}

// query "hw.cputype" via sysctl (mach cpu architecture, e.g. CPU_TYPE_ARM64)
static cpu_type_t get_hw_cputype()
{
    cpu_type_t value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cputype", &value, &len, NULL, 0);
    return value;
}

// query "hw.cpusubtype" via sysctl (cpu variant within the architecture)
static cpu_subtype_t get_hw_cpusubtype()
{
    cpu_subtype_t value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
    return value;
}

// queried once at startup; these values never change at runtime
static unsigned int g_hw_cpufamily = get_hw_cpufamily();
static cpu_type_t g_hw_cputype = get_hw_cputype();
static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
178 #endif // __APPLE__
179 
180 #if defined __ANDROID__ || defined __linux__
// CpuSet (linux flavor) wraps a kernel cpu_set_t bitmask; starts empty
CpuSet::CpuSet()
{
    disable_all();
}

// mark cpu as allowed
void CpuSet::enable(int cpu)
{
    CPU_SET(cpu, &cpu_set);
}

// mark cpu as not allowed
void CpuSet::disable(int cpu)
{
    CPU_CLR(cpu, &cpu_set);
}

// remove every cpu from the set
void CpuSet::disable_all()
{
    CPU_ZERO(&cpu_set);
}

bool CpuSet::is_enabled(int cpu) const
{
    return CPU_ISSET(cpu, &cpu_set);
}

// count set bits by probing every position the fixed-size mask can hold
int CpuSet::num_enabled() const
{
    int num_enabled = 0;
    for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
    {
        if (is_enabled(i))
            num_enabled++;
    }

    return num_enabled;
}
217 #elif __APPLE__
// CpuSet (apple flavor): `policy` is a plain integer bitmask, one bit per cpu
CpuSet::CpuSet()
{
    disable_all();
}

// set the bit for cpu
void CpuSet::enable(int cpu)
{
    policy |= (1 << cpu);
}

// clear the bit for cpu
void CpuSet::disable(int cpu)
{
    policy &= ~(1 << cpu);
}

// clear the whole mask
void CpuSet::disable_all()
{
    policy = 0;
}

bool CpuSet::is_enabled(int cpu) const
{
    return policy & (1 << cpu);
}

// count set bits by probing every position the mask can hold
int CpuSet::num_enabled() const
{
    int num_enabled = 0;
    for (int i = 0; i < (int)sizeof(policy) * 8; i++)
    {
        if (is_enabled(i))
            num_enabled++;
    }

    return num_enabled;
}
254 #else
// CpuSet fallback for platforms without affinity support:
// carries no state, reports every cpu as enabled
CpuSet::CpuSet()
{
}

void CpuSet::enable(int /* cpu */)
{
}

void CpuSet::disable(int /* cpu */)
{
}

void CpuSet::disable_all()
{
}

// every cpu is considered enabled on unsupported platforms
bool CpuSet::is_enabled(int /* cpu */) const
{
    return true;
}

// report the full logical cpu count
int CpuSet::num_enabled() const
{
    return get_cpu_count();
}
280 #endif
281 
// whether the runtime cpu supports arm neon (asimd on aarch64); nonzero = yes
int cpu_support_arm_neon()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_NEON;
#endif
#elif __APPLE__
#if __aarch64__
    // every arm64 apple cpu has neon
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    // NOTE(review): assumes subtypes above plain v7 imply neon — confirm mapping
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
    return 0;
#endif
}
300 
// whether the runtime cpu supports arm vfpv4 (fma / fp16 conversion); nonzero = yes
int cpu_support_arm_vfpv4()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    // neon always enable fma and fp16
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __APPLE__
#if __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    // NOTE(review): assumes subtypes above v7s imply vfpv4 — confirm mapping
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
    return 0;
#endif
}
320 
// whether the cpu supports half-precision asimd arithmetic (armv8.2 fp16); nonzero = yes
int cpu_support_arm_asimdhp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDHP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // apple A11 (monsoon/mistral) and later families
    return g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
339 
// whether the cpu supports asimd dot-product instructions (sdot/udot); nonzero = yes
int cpu_support_arm_asimddp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDDP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // same family list as asimdhp — NOTE(review): dotprod is armv8.4/8.2+;
    // confirm A11 (monsoon/mistral) actually supports it
    return g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
358 
// runtime check for x86 avx2, including os support for saving ymm state; nonzero = yes
int cpu_support_x86_avx2()
{
#if (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
#if defined(_MSC_VER)
    // TODO move to init function
    int cpu_info[4];
    __cpuid(cpu_info, 0);

    // leaf 0 eax = highest supported standard leaf; avx2 is reported in leaf 7
    int nIds = cpu_info[0];
    if (nIds < 7)
        return 0;

    __cpuid(cpu_info, 1);
    // check AVX XSAVE OSXSAVE
    // ecx bit 28 = AVX, bit 26 = XSAVE, bit 27 = OSXSAVE
    if (!(cpu_info[2] & 0x10000000) || !(cpu_info[2] & 0x04000000) || !(cpu_info[2] & 0x08000000))
        return 0;

    // check XSAVE enabled by kernel
    // xcr0 bits 1 and 2 = xmm and ymm state are saved on context switch
    if ((_xgetbv(0) & 6) != 6)
        return 0;

    __cpuid(cpu_info, 7);
    // leaf 7 ebx bit 5 = AVX2
    return cpu_info[1] & 0x00000020;
#elif defined(__clang__)
#if __clang_major__ >= 6
    __builtin_cpu_init();
#endif
    return __builtin_cpu_supports("avx2");
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
    __builtin_cpu_init();
    return __builtin_cpu_supports("avx2");
#else
    // TODO: other x86 compilers checking avx2 here
    NCNN_LOGE("AVX2 detection method is unknown for current compiler");
    return 0;
#endif
#else
    return 0;
#endif
}
399 
// whether the kernel reports the riscv vector (V) extension; nonzero = yes
int cpu_support_riscv_v()
{
#if defined __ANDROID__ || defined __linux__
#if __riscv
    return g_hwcaps & COMPAT_HWCAP_ISA_V;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
412 
// runtime check for the riscv zfh (half-precision float) extension; nonzero = yes
int cpu_support_riscv_zfh()
{
#if __riscv
#if __riscv_zfh
    // https://github.com/riscv/riscv-zfinx/blob/master/Zfinx_spec.adoc#5-discovery
    // execute fneg.h on +0.0: if the instruction is implemented the sign bit
    // flips, so the resulting bit pattern (-0.0 = 0x8000) is nonzero
    __fp16 a = 0;
    asm volatile(
        "fneg.h     %0, %0  \n"
        : "=f"(a)
        : "0"(a)
        :);
    union
    {
        __fp16 a;
        unsigned short u;
    } tmp;
    tmp.a = a;
    return tmp.u != 0 ? 1 : 0;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
438 
// vector register length in bytes (the vlenb csr); 0 when the V extension is absent
int cpu_riscv_vlenb()
{
#if __riscv
    if (!cpu_support_riscv_v())
        return 0;

    int a = 0;
    asm volatile(
        ".word  0xc22026f3  \n" // csrr  a3, vlenb  (raw encoding, works even if the assembler lacks the csr name)
        "mv     %0, a3      \n"
        : "=r"(a)
        :
        : "memory", "a3");
    return a;
#else
    return 0;
#endif
}
457 
// count the logical processors visible to this process, never less than 1
static int get_cpucount()
{
    int count = 0;
#ifdef __EMSCRIPTEN__
    // wasm builds may have no thread support at all
    if (emscripten_has_threading_support())
        count = emscripten_num_logical_cores();
    else
        count = 1;
#elif defined __ANDROID__ || defined __linux__
    // /proc/cpuinfo holds one "processor" record per logical cpu
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp)
        return 1;

    char buf[1024];
    while (fgets(buf, 1024, fp))
    {
        if (memcmp(buf, "processor", 9) == 0)
            count++;
    }

    fclose(fp);
#elif __APPLE__
    size_t len = sizeof(count);
    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
#else
#ifdef _OPENMP
    count = omp_get_max_threads();
#else
    count = 1;
#endif // _OPENMP
#endif

    // never report less than one core
    return count > 0 ? count : 1;
}
502 
// cached once at startup; cpu hotplug after this point is not reflected
static int g_cpucount = get_cpucount();

// total number of logical cpus
int get_cpu_count()
{
    return g_cpucount;
}

// number of low-frequency (little) cores; 0 when the topology is unknown
int get_little_cpu_count()
{
    return get_cpu_thread_affinity_mask(1).num_enabled();
}

// number of high-frequency (big) cores; falls back to the full count
// when the big mask is empty
int get_big_cpu_count()
{
    int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
    return big_cpu_count ? big_cpu_count : g_cpucount;
}
520 
521 #if defined __ANDROID__ || defined __linux__
get_max_freq_khz(int cpuid)522 static int get_max_freq_khz(int cpuid)
523 {
524     // first try, for all possible cpu
525     char path[256];
526     sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
527 
528     FILE* fp = fopen(path, "rb");
529 
530     if (!fp)
531     {
532         // second try, for online cpu
533         sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
534         fp = fopen(path, "rb");
535 
536         if (fp)
537         {
538             int max_freq_khz = 0;
539             while (!feof(fp))
540             {
541                 int freq_khz = 0;
542                 int nscan = fscanf(fp, "%d %*d", &freq_khz);
543                 if (nscan != 1)
544                     break;
545 
546                 if (freq_khz > max_freq_khz)
547                     max_freq_khz = freq_khz;
548             }
549 
550             fclose(fp);
551 
552             if (max_freq_khz != 0)
553                 return max_freq_khz;
554 
555             fp = NULL;
556         }
557 
558         if (!fp)
559         {
560             // third try, for online cpu
561             sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
562             fp = fopen(path, "rb");
563 
564             if (!fp)
565                 return -1;
566 
567             int max_freq_khz = -1;
568             int nscan = fscanf(fp, "%d", &max_freq_khz);
569             if (nscan != 1)
570             {
571                 NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
572             }
573             fclose(fp);
574 
575             return max_freq_khz;
576         }
577     }
578 
579     int max_freq_khz = 0;
580     while (!feof(fp))
581     {
582         int freq_khz = 0;
583         int nscan = fscanf(fp, "%d %*d", &freq_khz);
584         if (nscan != 1)
585             break;
586 
587         if (freq_khz > max_freq_khz)
588             max_freq_khz = freq_khz;
589     }
590 
591     fclose(fp);
592 
593     return max_freq_khz;
594 }
595 
// bind the CALLING thread to the cpus in thread_affinity_mask
// returns 0 on success, -1 on syscall failure
static int set_sched_affinity(const CpuSet& thread_affinity_mask)
{
    // set affinity for thread
#if defined(__GLIBC__) || defined(__OHOS__)
    // glibc exposes no gettid() wrapper (pre-2.30); go through syscall
    pid_t pid = syscall(SYS_gettid);
#else
#if defined(PI3) || (defined(__MUSL__) && __MUSL_MINOR__ <= 14)
    // old musl / PI3 toolchains lack gettid(); fall back to the process id
    pid_t pid = getpid();
#else
    pid_t pid = gettid();
#endif
#endif

    // raw syscall instead of sched_setaffinity() to avoid libc cpu_set size quirks
    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
    if (syscallret)
    {
        NCNN_LOGE("syscall error %d", syscallret);
        return -1;
    }

    return 0;
}
618 #endif // defined __ANDROID__ || defined __linux__
619 
620 #if __APPLE__
// best-effort affinity hint for the calling thread on apple platforms
// returns 0 on success (including "not supported"), -1 on a real error
static int set_sched_affinity(const CpuSet& thread_affinity_mask)
{
    // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
    // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
    // https://gist.github.com/Coneko/4234842

    // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
    // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919   --- AmeAkio

    // derive the tag from the first enabled cpu index (+1 so that cpu 0 is
    // distinguishable from THREAD_AFFINITY_TAG_NULL)
    int affinity_tag = THREAD_AFFINITY_TAG_NULL;
    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
    {
        if (thread_affinity_mask.is_enabled(i))
        {
            affinity_tag = i + 1;
            break;
        }
    }

    mach_port_t tid = pthread_mach_thread_np(pthread_self());

    thread_affinity_policy_data_t policy_data;
    policy_data.affinity_tag = affinity_tag;
    int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
    // KERN_NOT_SUPPORTED is expected on modern macOS and is not treated as failure
    if (ret && ret != KERN_NOT_SUPPORTED)
    {
        NCNN_LOGE("thread_policy_set error %d", ret);
        return -1;
    }

    return 0;
}
654 #endif // __APPLE__
655 
// powersave mode last applied successfully via set_cpu_powersave(); 0 = all cores
static int g_powersave = 0;

// return the current powersave mode (0 = all, 1 = little only, 2 = big only)
int get_cpu_powersave()
{
    return g_powersave;
}
662 
set_cpu_powersave(int powersave)663 int set_cpu_powersave(int powersave)
664 {
665     if (powersave < 0 || powersave > 2)
666     {
667         NCNN_LOGE("powersave %d not supported", powersave);
668         return -1;
669     }
670 
671     const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
672 
673     int ret = set_cpu_thread_affinity(thread_affinity_mask);
674     if (ret != 0)
675         return ret;
676 
677     g_powersave = powersave;
678 
679     return 0;
680 }
681 
// cached affinity masks rebuilt by setup_thread_affinity_masks()
static CpuSet g_thread_affinity_mask_all;    // every cpu
static CpuSet g_thread_affinity_mask_little; // low-frequency (efficiency) cores
static CpuSet g_thread_affinity_mask_big;    // high-frequency (performance) cores
685 
setup_thread_affinity_masks()686 static int setup_thread_affinity_masks()
687 {
688     g_thread_affinity_mask_all.disable_all();
689 
690 #if defined __ANDROID__ || defined __linux__
691     int max_freq_khz_min = INT_MAX;
692     int max_freq_khz_max = 0;
693     std::vector<int> cpu_max_freq_khz(g_cpucount);
694     for (int i = 0; i < g_cpucount; i++)
695     {
696         int max_freq_khz = get_max_freq_khz(i);
697 
698         //         NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
699 
700         cpu_max_freq_khz[i] = max_freq_khz;
701 
702         if (max_freq_khz > max_freq_khz_max)
703             max_freq_khz_max = max_freq_khz;
704         if (max_freq_khz < max_freq_khz_min)
705             max_freq_khz_min = max_freq_khz;
706     }
707 
708     int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
709     if (max_freq_khz_medium == max_freq_khz_max)
710     {
711         g_thread_affinity_mask_little.disable_all();
712         g_thread_affinity_mask_big = g_thread_affinity_mask_all;
713         return 0;
714     }
715 
716     for (int i = 0; i < g_cpucount; i++)
717     {
718         if (cpu_max_freq_khz[i] < max_freq_khz_medium)
719             g_thread_affinity_mask_little.enable(i);
720         else
721             g_thread_affinity_mask_big.enable(i);
722     }
723 #elif __APPLE__
724     // affinity info from cpu model
725     if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
726     {
727         // 2 + 4
728         g_thread_affinity_mask_big.enable(0);
729         g_thread_affinity_mask_big.enable(1);
730         g_thread_affinity_mask_little.enable(2);
731         g_thread_affinity_mask_little.enable(3);
732         g_thread_affinity_mask_little.enable(4);
733         g_thread_affinity_mask_little.enable(5);
734     }
735     else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM)
736     {
737         // 2 + 4 or 4 + 4
738         if (get_cpu_count() == 6)
739         {
740             g_thread_affinity_mask_big.enable(0);
741             g_thread_affinity_mask_big.enable(1);
742             g_thread_affinity_mask_little.enable(2);
743             g_thread_affinity_mask_little.enable(3);
744             g_thread_affinity_mask_little.enable(4);
745             g_thread_affinity_mask_little.enable(5);
746         }
747         else
748         {
749             g_thread_affinity_mask_big.enable(0);
750             g_thread_affinity_mask_big.enable(1);
751             g_thread_affinity_mask_big.enable(2);
752             g_thread_affinity_mask_big.enable(3);
753             g_thread_affinity_mask_little.enable(4);
754             g_thread_affinity_mask_little.enable(5);
755             g_thread_affinity_mask_little.enable(6);
756             g_thread_affinity_mask_little.enable(7);
757         }
758     }
759     else
760     {
761         // smp models
762         g_thread_affinity_mask_little.disable_all();
763         g_thread_affinity_mask_big = g_thread_affinity_mask_all;
764     }
765 #else
766     // TODO implement me for other platforms
767     g_thread_affinity_mask_little.disable_all();
768     g_thread_affinity_mask_big = g_thread_affinity_mask_all;
769 #endif
770 
771     return 0;
772 }
773 
get_cpu_thread_affinity_mask(int powersave)774 const CpuSet& get_cpu_thread_affinity_mask(int powersave)
775 {
776     setup_thread_affinity_masks();
777 
778     if (powersave == 0)
779         return g_thread_affinity_mask_all;
780 
781     if (powersave == 1)
782         return g_thread_affinity_mask_little;
783 
784     if (powersave == 2)
785         return g_thread_affinity_mask_big;
786 
787     NCNN_LOGE("powersave %d not supported", powersave);
788 
789     // fallback to all cores anyway
790     return g_thread_affinity_mask_all;
791 }
792 
// bind the omp worker team (or just the calling thread without openmp)
// to the cpus in thread_affinity_mask; returns 0 on success, -1 on failure
int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
{
#if defined __ANDROID__ || defined __linux__
    int num_threads = thread_affinity_mask.num_enabled();

#ifdef _OPENMP
    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        // every worker binds itself to the same mask; the kernel schedules within it
        ssarets[i] = set_sched_affinity(thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#elif __APPLE__

#ifdef _OPENMP
    int num_threads = thread_affinity_mask.num_enabled();

    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        // assign one core for each thread
        // select the i-th enabled cpu: core counts up from -1-i and reaches -1
        // after skipping i enabled bits (num_threads == num_enabled guarantees
        // an i-th enabled bit exists)
        int core = -1 - i;
        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
        {
            if (thread_affinity_mask.is_enabled(j))
            {
                if (core == -1)
                {
                    core = j;
                    break;
                }
                else
                {
                    core++;
                }
            }
        }
        CpuSet this_thread_affinity_mask;
        // core != -1 - i means at least one enabled bit was seen
        if (core != -1 - i)
        {
            this_thread_affinity_mask.enable(core);
        }

        ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#else
    // TODO
    (void)thread_affinity_mask;
    return -1;
#endif
}
873 
// number of threads in the current omp team (1 without openmp)
int get_omp_num_threads()
{
#ifdef _OPENMP
    return omp_get_num_threads();
#else
    return 1;
#endif
}

// set the default omp team size (no-op without openmp)
void set_omp_num_threads(int num_threads)
{
#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#else
    (void)num_threads;
#endif
}

// whether omp dynamic thread adjustment is enabled (0 without openmp)
int get_omp_dynamic()
{
#ifdef _OPENMP
    return omp_get_dynamic();
#else
    return 0;
#endif
}

// enable/disable omp dynamic thread adjustment (no-op without openmp)
void set_omp_dynamic(int dynamic)
{
#ifdef _OPENMP
    omp_set_dynamic(dynamic);
#else
    (void)dynamic;
#endif
}

// index of the calling thread within its omp team (0 without openmp)
int get_omp_thread_num()
{
#ifdef _OPENMP
    return omp_get_thread_num();
#else
    return 0;
#endif
}

// kmp blocktime: ms a worker spins before sleeping (clang/libomp only, else 0)
int get_kmp_blocktime()
{
#if defined(_OPENMP) && __clang__
    return kmp_get_blocktime();
#else
    return 0;
#endif
}

// set kmp blocktime (clang/libomp only, else no-op)
void set_kmp_blocktime(int time_ms)
{
#if defined(_OPENMP) && __clang__
    kmp_set_blocktime(time_ms);
#else
    (void)time_ms;
#endif
}
936 
// per-thread record of the last mode set via set_flush_denormals()
static ncnn::ThreadLocalStorage tls_flush_denormals;

// return the flush-denormals mode previously set on this thread
// 0 = off, 1 = DAZ, 2 = FTZ, 3 = DAZ + FTZ; always 0 on non-SSE3 builds
int get_flush_denormals()
{
#if defined(__SSE3__)
    // the mode integer is stashed directly in the tls pointer slot
    return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
#else
    return 0;
#endif
}
947 
set_flush_denormals(int flush_denormals)948 int set_flush_denormals(int flush_denormals)
949 {
950     if (flush_denormals < 0 || flush_denormals > 3)
951     {
952         NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
953         return -1;
954     }
955 #if defined(__SSE3__)
956     if (flush_denormals == 0)
957     {
958         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
959         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
960     }
961     else if (flush_denormals == 1)
962     {
963         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
964         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
965     }
966     else if (flush_denormals == 2)
967     {
968         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
969         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
970     }
971     else if (flush_denormals == 3)
972     {
973         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
974         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
975     }
976 
977     tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
978     return 0;
979 #else
980     return 0;
981 #endif
982 }
983 
984 } // namespace ncnn
985