1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "cpu.h"
16
17 #include "platform.h"
18
19 #include <limits.h>
20 #include <stdio.h>
21 #include <string.h>
22
23 #ifdef _OPENMP
24 #if NCNN_SIMPLEOMP
25 #include "simpleomp.h"
26 #else
27 #include <omp.h>
28 #endif
29 #endif
30
31 #ifdef _MSC_VER
32 #include <intrin.h> // __cpuid()
33 #include <immintrin.h> // _xgetbv()
34 #endif
35
36 #ifdef __EMSCRIPTEN__
37 #include <emscripten/threading.h>
38 #endif
39
40 #if defined __ANDROID__ || defined __linux__
41 #include <stdint.h>
42 #include <sys/syscall.h>
43 #include <unistd.h>
44 #endif
45
46 #if __APPLE__
47 #include <mach/mach.h>
48 #include <mach/machine.h>
49 #include <mach/thread_act.h>
50 #include <sys/sysctl.h>
51 #include <sys/types.h>
52 #include "TargetConditionals.h"
53 #if TARGET_OS_IPHONE
54 #define __IOS__ 1
55 #endif
56 // define missing cpu model for old sdk
57 #ifndef CPUFAMILY_ARM_HURRICANE
58 #define CPUFAMILY_ARM_HURRICANE 0x67ceee93
59 #endif
60 #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
61 #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
62 #endif
63 #ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
64 #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
65 #endif
66 #ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
67 #define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
68 #endif
69 #ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
70 #define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
71 #endif
72 #endif
73
74 #if defined(__SSE3__)
75 #include <immintrin.h>
76 #endif
77
78 namespace ncnn {
79
80 #if defined __ANDROID__ || defined __linux__
81
82 // extract the ELF HW capabilities bitmap from /proc/self/auxv
// parse /proc/self/auxv and return the AT_HWCAP value, 0 on any failure
static unsigned int get_elf_hwcap_from_proc_self_auxv()
{
    FILE* fp = fopen("/proc/self/auxv", "rb");
    if (!fp)
    {
        return 0;
    }

// auxv tag ids from linux/auxvec.h
#define AT_HWCAP 16
#define AT_HWCAP2 26
#if __aarch64__ || __riscv_xlen == 64
    // auxv entries are (tag, value) pairs sized to the native word
    struct
    {
        uint64_t tag;
        uint64_t value;
    } entry;
#else
    struct
    {
        unsigned int tag;
        unsigned int value;
    } entry;

#endif

    unsigned int result = 0;
    while (!feof(fp))
    {
        int nread = fread((char*)&entry, sizeof(entry), 1, fp);
        if (nread != 1)
            break;

        // an all-zero entry (AT_NULL) terminates the vector
        if (entry.tag == 0 && entry.value == 0)
            break;

        if (entry.tag == AT_HWCAP)
        {
            result = entry.value;
            break;
        }
    }

    fclose(fp);

    return result;
}

// cached once at startup, the kernel hwcap bitmap never changes at runtime
static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();
131
132 #if __aarch64__
133 // from arch/arm64/include/uapi/asm/hwcap.h
134 #define HWCAP_ASIMD (1 << 1)
135 #define HWCAP_ASIMDHP (1 << 10)
136 #define HWCAP_ASIMDDP (1 << 20)
137 #else
138 // from arch/arm/include/uapi/asm/hwcap.h
139 #define HWCAP_NEON (1 << 12)
140 #define HWCAP_VFPv4 (1 << 16)
141 #endif
142
143 #if __riscv
144 // from arch/riscv/include/uapi/asm/hwcap.h
145 #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
146 #endif
147
148 #endif // defined __ANDROID__ || defined __linux__
149
150 #if __APPLE__
get_hw_cpufamily()151 static unsigned int get_hw_cpufamily()
152 {
153 unsigned int value = 0;
154 size_t len = sizeof(value);
155 sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
156 return value;
157 }
158
get_hw_cputype()159 static cpu_type_t get_hw_cputype()
160 {
161 cpu_type_t value = 0;
162 size_t len = sizeof(value);
163 sysctlbyname("hw.cputype", &value, &len, NULL, 0);
164 return value;
165 }
166
get_hw_cpusubtype()167 static cpu_subtype_t get_hw_cpusubtype()
168 {
169 cpu_subtype_t value = 0;
170 size_t len = sizeof(value);
171 sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
172 return value;
173 }
174
175 static unsigned int g_hw_cpufamily = get_hw_cpufamily();
176 static cpu_type_t g_hw_cputype = get_hw_cputype();
177 static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
178 #endif // __APPLE__
179
180 #if defined __ANDROID__ || defined __linux__
// on linux CpuSet wraps a kernel cpu_set_t bitmask
CpuSet::CpuSet()
{
    disable_all();
}

// mark cpu as usable in this set
void CpuSet::enable(int cpu)
{
    CPU_SET(cpu, &cpu_set);
}

// remove cpu from this set
void CpuSet::disable(int cpu)
{
    CPU_CLR(cpu, &cpu_set);
}

// clear the whole set
void CpuSet::disable_all()
{
    CPU_ZERO(&cpu_set);
}

// test whether cpu is part of this set
bool CpuSet::is_enabled(int cpu) const
{
    return CPU_ISSET(cpu, &cpu_set);
}

// count enabled cpus by scanning every bit the mask can hold
int CpuSet::num_enabled() const
{
    int num_enabled = 0;
    for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
    {
        if (is_enabled(i))
            num_enabled++;
    }

    return num_enabled;
}
217 #elif __APPLE__
CpuSet()218 CpuSet::CpuSet()
219 {
220 disable_all();
221 }
222
enable(int cpu)223 void CpuSet::enable(int cpu)
224 {
225 policy |= (1 << cpu);
226 }
227
disable(int cpu)228 void CpuSet::disable(int cpu)
229 {
230 policy &= ~(1 << cpu);
231 }
232
disable_all()233 void CpuSet::disable_all()
234 {
235 policy = 0;
236 }
237
is_enabled(int cpu) const238 bool CpuSet::is_enabled(int cpu) const
239 {
240 return policy & (1 << cpu);
241 }
242
num_enabled() const243 int CpuSet::num_enabled() const
244 {
245 int num_enabled = 0;
246 for (int i = 0; i < (int)sizeof(policy) * 8; i++)
247 {
248 if (is_enabled(i))
249 num_enabled++;
250 }
251
252 return num_enabled;
253 }
254 #else
// fallback CpuSet for platforms without any affinity support
CpuSet::CpuSet()
{
}

// no-op, affinity is not supported here
void CpuSet::enable(int /* cpu */)
{
}

// no-op, affinity is not supported here
void CpuSet::disable(int /* cpu */)
{
}

// no-op, affinity is not supported here
void CpuSet::disable_all()
{
}

// every cpu is reported as enabled
bool CpuSet::is_enabled(int /* cpu */) const
{
    return true;
}

// all cpus count as enabled
int CpuSet::num_enabled() const
{
    return get_cpu_count();
}
280 #endif
281
// whether the cpu supports arm neon / aarch64 asimd instructions
// returns nonzero when supported
int cpu_support_arm_neon()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    // asimd is the aarch64 name for neon, trust the kernel hwcap
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_NEON;
#endif
#elif __APPLE__
#if __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    // NOTE(review): assumes every arm subtype newer than ARM_V7 carries neon - verify
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
    return 0;
#endif
}
300
// whether the cpu supports arm vfpv4 (fused multiply-add, fp16 conversion)
// returns nonzero when supported
int cpu_support_arm_vfpv4()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    // neon always enable fma and fp16
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __APPLE__
#if __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    // NOTE(review): assumes subtypes newer than ARM_V7S carry vfpv4 - verify
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
    return 0;
#endif
}
320
// whether the cpu supports aarch64 half precision asimd arithmetic (armv8.2 fp16)
// returns nonzero when supported
int cpu_support_arm_asimdhp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDHP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // apple families from A11 (monsoon/mistral) onwards support fp16 arithmetic
    return g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
339
// whether the cpu supports the asimd dot product instructions (armv8.2 dotprod)
// returns nonzero when supported
int cpu_support_arm_asimddp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDDP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // dotprod first appeared in apple A13 (lightning/thunder)
    // A11 (monsoon/mistral) and A12 (vortex/tempest) only have fp16, not dotprod,
    // so reporting them here would run sdot/udot kernels into SIGILL
    return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
358
// whether the cpu and the os both support avx2
// returns nonzero when supported
int cpu_support_x86_avx2()
{
#if (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
#if defined(_MSC_VER)
    // TODO move to init function
    int cpu_info[4];
    __cpuid(cpu_info, 0);

    // highest supported standard cpuid leaf must reach 7 for avx2 reporting
    int nIds = cpu_info[0];
    if (nIds < 7)
        return 0;

    __cpuid(cpu_info, 1);
    // check AVX XSAVE OSXSAVE
    // leaf 1 ecx: bit 28 = AVX, bit 27 = OSXSAVE, bit 26 = XSAVE
    if (!(cpu_info[2] & 0x10000000) || !(cpu_info[2] & 0x04000000) || !(cpu_info[2] & 0x08000000))
        return 0;

    // check XSAVE enabled by kernel
    // xcr0 bits 1 and 2: xmm and ymm state must both be os-managed
    if ((_xgetbv(0) & 6) != 6)
        return 0;

    // leaf 7 ebx: bit 5 = AVX2
    __cpuid(cpu_info, 7);
    return cpu_info[1] & 0x00000020;
#elif defined(__clang__)
#if __clang_major__ >= 6
    // older runtimes require explicit init before __builtin_cpu_supports
    __builtin_cpu_init();
#endif
    return __builtin_cpu_supports("avx2");
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
    __builtin_cpu_init();
    return __builtin_cpu_supports("avx2");
#else
    // TODO: other x86 compilers checking avx2 here
    NCNN_LOGE("AVX2 detection method is unknown for current compiler");
    return 0;
#endif
#else
    return 0;
#endif
}
399
// whether the cpu supports the riscv vector extension, from the kernel hwcap
// returns nonzero when supported
int cpu_support_riscv_v()
{
#if defined __ANDROID__ || defined __linux__
#if __riscv
    return g_hwcaps & COMPAT_HWCAP_ISA_V;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
412
// whether the cpu can execute the zfh half precision float extension
// returns 1 when supported, 0 otherwise
int cpu_support_riscv_zfh()
{
#if __riscv
#if __riscv_zfh
    // https://github.com/riscv/riscv-zfinx/blob/master/Zfinx_spec.adoc#5-discovery
    // runtime probe: negate +0.0 with fneg.h, a working zfh yields -0.0
    // whose encoding has the sign bit set, hence a nonzero bit pattern
    __fp16 a = 0;
    asm volatile(
        "fneg.h %0, %0 \n"
        : "=f"(a)
        : "0"(a)
        :);
    union
    {
        __fp16 a;
        unsigned short u;
    } tmp;
    tmp.a = a;
    return tmp.u != 0 ? 1 : 0;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
438
// return the riscv vector register length in bytes, 0 when the v extension is absent
int cpu_riscv_vlenb()
{
#if __riscv
    if (!cpu_support_riscv_v())
        return 0;

    int a = 0;
    asm volatile(
        // raw encoding so assemblers without v extension support still accept it
        ".word 0xc22026f3 \n" // csrr a3, vlenb
        "mv %0, a3 \n"
        : "=r"(a)
        :
        : "memory", "a3");
    return a;
#else
    return 0;
#endif
}
457
// detect the number of logical cpus, always at least 1
static int get_cpucount()
{
    int count = 0;
#ifdef __EMSCRIPTEN__
    if (emscripten_has_threading_support())
        count = emscripten_num_logical_cores();
    else
        count = 1;
#elif defined __ANDROID__ || defined __linux__
    // get cpu count from /proc/cpuinfo
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp)
        return 1;

    char line[1024];
    while (!feof(fp))
    {
        char* s = fgets(line, 1024, fp);
        if (!s)
            break;

        // one "processor : N" record per logical cpu
        if (memcmp(line, "processor", 9) == 0)
        {
            count++;
        }
    }

    fclose(fp);
#elif __APPLE__
    size_t len = sizeof(count);
    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
#else
#ifdef _OPENMP
    count = omp_get_max_threads();
#else
    count = 1;
#endif // _OPENMP
#endif

    // never report less than one cpu
    if (count < 1)
        count = 1;

    return count;
}
502
// detected once at startup, the cpu count cannot change afterwards
static int g_cpucount = get_cpucount();

// number of logical cpus
int get_cpu_count()
{
    return g_cpucount;
}
509
get_little_cpu_count()510 int get_little_cpu_count()
511 {
512 return get_cpu_thread_affinity_mask(1).num_enabled();
513 }
514
get_big_cpu_count()515 int get_big_cpu_count()
516 {
517 int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
518 return big_cpu_count ? big_cpu_count : g_cpucount;
519 }
520
521 #if defined __ANDROID__ || defined __linux__
get_max_freq_khz(int cpuid)522 static int get_max_freq_khz(int cpuid)
523 {
524 // first try, for all possible cpu
525 char path[256];
526 sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
527
528 FILE* fp = fopen(path, "rb");
529
530 if (!fp)
531 {
532 // second try, for online cpu
533 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
534 fp = fopen(path, "rb");
535
536 if (fp)
537 {
538 int max_freq_khz = 0;
539 while (!feof(fp))
540 {
541 int freq_khz = 0;
542 int nscan = fscanf(fp, "%d %*d", &freq_khz);
543 if (nscan != 1)
544 break;
545
546 if (freq_khz > max_freq_khz)
547 max_freq_khz = freq_khz;
548 }
549
550 fclose(fp);
551
552 if (max_freq_khz != 0)
553 return max_freq_khz;
554
555 fp = NULL;
556 }
557
558 if (!fp)
559 {
560 // third try, for online cpu
561 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
562 fp = fopen(path, "rb");
563
564 if (!fp)
565 return -1;
566
567 int max_freq_khz = -1;
568 int nscan = fscanf(fp, "%d", &max_freq_khz);
569 if (nscan != 1)
570 {
571 NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
572 }
573 fclose(fp);
574
575 return max_freq_khz;
576 }
577 }
578
579 int max_freq_khz = 0;
580 while (!feof(fp))
581 {
582 int freq_khz = 0;
583 int nscan = fscanf(fp, "%d %*d", &freq_khz);
584 if (nscan != 1)
585 break;
586
587 if (freq_khz > max_freq_khz)
588 max_freq_khz = freq_khz;
589 }
590
591 fclose(fp);
592
593 return max_freq_khz;
594 }
595
// bind the calling thread to the cpus in thread_affinity_mask
// returns 0 on success, -1 on syscall failure
static int set_sched_affinity(const CpuSet& thread_affinity_mask)
{
    // set affinity for thread
#if defined(__GLIBC__) || defined(__OHOS__)
    pid_t pid = syscall(SYS_gettid);
#else
#if defined(PI3) || (defined(__MUSL__) && __MUSL_MINOR__ <= 14)
    // old musl and PI3 toolchains lack gettid(), fall back to the process id
    pid_t pid = getpid();
#else
    pid_t pid = gettid();
#endif
#endif

    // raw syscall, sched_setaffinity() wrappers differ across libc versions
    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
    if (syscallret)
    {
        NCNN_LOGE("syscall error %d", syscallret);
        return -1;
    }

    return 0;
}
618 #endif // defined __ANDROID__ || defined __linux__
619
620 #if __APPLE__
// suggest an affinity for the calling thread via the mach affinity tag api
// returns 0 on success (or when the kernel does not support the policy), -1 on error
static int set_sched_affinity(const CpuSet& thread_affinity_mask)
{
    // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
    // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
    // https://gist.github.com/Coneko/4234842

    // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
    // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio

    // use the first enabled cpu index (+1) as the affinity tag, NULL when the mask is empty
    int affinity_tag = THREAD_AFFINITY_TAG_NULL;
    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
    {
        if (thread_affinity_mask.is_enabled(i))
        {
            affinity_tag = i + 1;
            break;
        }
    }

    mach_port_t tid = pthread_mach_thread_np(pthread_self());

    thread_affinity_policy_data_t policy_data;
    policy_data.affinity_tag = affinity_tag;
    // KERN_NOT_SUPPORTED is tolerated, modern macOS ignores affinity hints
    int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
    if (ret && ret != KERN_NOT_SUPPORTED)
    {
        NCNN_LOGE("thread_policy_set error %d", ret);
        return -1;
    }

    return 0;
}
654 #endif // __APPLE__
655
// current powersave level, 0 = all cores, 1 = little cores only, 2 = big cores only
static int g_powersave = 0;

// return the powersave level last applied by set_cpu_powersave()
int get_cpu_powersave()
{
    return g_powersave;
}
662
set_cpu_powersave(int powersave)663 int set_cpu_powersave(int powersave)
664 {
665 if (powersave < 0 || powersave > 2)
666 {
667 NCNN_LOGE("powersave %d not supported", powersave);
668 return -1;
669 }
670
671 const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
672
673 int ret = set_cpu_thread_affinity(thread_affinity_mask);
674 if (ret != 0)
675 return ret;
676
677 g_powersave = powersave;
678
679 return 0;
680 }
681
// affinity masks per powersave level, populated by setup_thread_affinity_masks()
static CpuSet g_thread_affinity_mask_all;
static CpuSet g_thread_affinity_mask_little;
static CpuSet g_thread_affinity_mask_big;
685
setup_thread_affinity_masks()686 static int setup_thread_affinity_masks()
687 {
688 g_thread_affinity_mask_all.disable_all();
689
690 #if defined __ANDROID__ || defined __linux__
691 int max_freq_khz_min = INT_MAX;
692 int max_freq_khz_max = 0;
693 std::vector<int> cpu_max_freq_khz(g_cpucount);
694 for (int i = 0; i < g_cpucount; i++)
695 {
696 int max_freq_khz = get_max_freq_khz(i);
697
698 // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
699
700 cpu_max_freq_khz[i] = max_freq_khz;
701
702 if (max_freq_khz > max_freq_khz_max)
703 max_freq_khz_max = max_freq_khz;
704 if (max_freq_khz < max_freq_khz_min)
705 max_freq_khz_min = max_freq_khz;
706 }
707
708 int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
709 if (max_freq_khz_medium == max_freq_khz_max)
710 {
711 g_thread_affinity_mask_little.disable_all();
712 g_thread_affinity_mask_big = g_thread_affinity_mask_all;
713 return 0;
714 }
715
716 for (int i = 0; i < g_cpucount; i++)
717 {
718 if (cpu_max_freq_khz[i] < max_freq_khz_medium)
719 g_thread_affinity_mask_little.enable(i);
720 else
721 g_thread_affinity_mask_big.enable(i);
722 }
723 #elif __APPLE__
724 // affinity info from cpu model
725 if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
726 {
727 // 2 + 4
728 g_thread_affinity_mask_big.enable(0);
729 g_thread_affinity_mask_big.enable(1);
730 g_thread_affinity_mask_little.enable(2);
731 g_thread_affinity_mask_little.enable(3);
732 g_thread_affinity_mask_little.enable(4);
733 g_thread_affinity_mask_little.enable(5);
734 }
735 else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM)
736 {
737 // 2 + 4 or 4 + 4
738 if (get_cpu_count() == 6)
739 {
740 g_thread_affinity_mask_big.enable(0);
741 g_thread_affinity_mask_big.enable(1);
742 g_thread_affinity_mask_little.enable(2);
743 g_thread_affinity_mask_little.enable(3);
744 g_thread_affinity_mask_little.enable(4);
745 g_thread_affinity_mask_little.enable(5);
746 }
747 else
748 {
749 g_thread_affinity_mask_big.enable(0);
750 g_thread_affinity_mask_big.enable(1);
751 g_thread_affinity_mask_big.enable(2);
752 g_thread_affinity_mask_big.enable(3);
753 g_thread_affinity_mask_little.enable(4);
754 g_thread_affinity_mask_little.enable(5);
755 g_thread_affinity_mask_little.enable(6);
756 g_thread_affinity_mask_little.enable(7);
757 }
758 }
759 else
760 {
761 // smp models
762 g_thread_affinity_mask_little.disable_all();
763 g_thread_affinity_mask_big = g_thread_affinity_mask_all;
764 }
765 #else
766 // TODO implement me for other platforms
767 g_thread_affinity_mask_little.disable_all();
768 g_thread_affinity_mask_big = g_thread_affinity_mask_all;
769 #endif
770
771 return 0;
772 }
773
get_cpu_thread_affinity_mask(int powersave)774 const CpuSet& get_cpu_thread_affinity_mask(int powersave)
775 {
776 setup_thread_affinity_masks();
777
778 if (powersave == 0)
779 return g_thread_affinity_mask_all;
780
781 if (powersave == 1)
782 return g_thread_affinity_mask_little;
783
784 if (powersave == 2)
785 return g_thread_affinity_mask_big;
786
787 NCNN_LOGE("powersave %d not supported", powersave);
788
789 // fallback to all cores anyway
790 return g_thread_affinity_mask_all;
791 }
792
// apply thread_affinity_mask to every openmp worker thread
// (or just the calling thread when openmp is disabled)
// returns 0 on success, -1 on failure or unsupported platform
int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
{
#if defined __ANDROID__ || defined __linux__
    int num_threads = thread_affinity_mask.num_enabled();

#ifdef _OPENMP
    // set affinity for each thread
    // the parallel region forces each worker to run the syscall on itself
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        ssarets[i] = set_sched_affinity(thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#elif __APPLE__

#ifdef _OPENMP
    int num_threads = thread_affinity_mask.num_enabled();

    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        // assign one core for each thread
        // starting at -1-i and incrementing once per enabled bit makes thread i
        // stop exactly on the i-th enabled cpu of the mask
        int core = -1 - i;
        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
        {
            if (thread_affinity_mask.is_enabled(j))
            {
                if (core == -1)
                {
                    core = j;
                    break;
                }
                else
                {
                    core++;
                }
            }
        }
        // an unchanged core means the mask had no cpu for this thread,
        // leaving this_thread_affinity_mask empty (affinity tag NULL)
        CpuSet this_thread_affinity_mask;
        if (core != -1 - i)
        {
            this_thread_affinity_mask.enable(core);
        }

        ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#else
    // TODO
    (void)thread_affinity_mask;
    return -1;
#endif
}
873
// number of threads in the current openmp parallel region, 1 without openmp
int get_omp_num_threads()
{
#ifdef _OPENMP
    return omp_get_num_threads();
#else
    return 1;
#endif
}
882
// set the default openmp thread count, no-op without openmp
void set_omp_num_threads(int num_threads)
{
#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#else
    (void)num_threads;
#endif
}
891
// whether openmp dynamic thread adjustment is enabled, 0 without openmp
int get_omp_dynamic()
{
#ifdef _OPENMP
    return omp_get_dynamic();
#else
    return 0;
#endif
}
900
// toggle openmp dynamic thread adjustment, no-op without openmp
void set_omp_dynamic(int dynamic)
{
#ifdef _OPENMP
    omp_set_dynamic(dynamic);
#else
    (void)dynamic;
#endif
}
909
// index of the calling thread inside the parallel region, 0 without openmp
int get_omp_thread_num()
{
#ifdef _OPENMP
    return omp_get_thread_num();
#else
    return 0;
#endif
}
918
// kmp blocktime: ms a worker spins before sleeping, clang/libomp only
int get_kmp_blocktime()
{
#if defined(_OPENMP) && __clang__
    return kmp_get_blocktime();
#else
    return 0;
#endif
}
927
// set kmp blocktime in milliseconds, clang/libomp only, otherwise no-op
void set_kmp_blocktime(int time_ms)
{
#if defined(_OPENMP) && __clang__
    kmp_set_blocktime(time_ms);
#else
    (void)time_ms;
#endif
}
936
// per-thread record of the denormal handling mode set by set_flush_denormals()
static ncnn::ThreadLocalStorage tls_flush_denormals;

// return the denormal mode last set on this thread, always 0 on targets without sse3
int get_flush_denormals()
{
#if defined(__SSE3__)
    return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
#else
    return 0;
#endif
}
947
set_flush_denormals(int flush_denormals)948 int set_flush_denormals(int flush_denormals)
949 {
950 if (flush_denormals < 0 || flush_denormals > 3)
951 {
952 NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
953 return -1;
954 }
955 #if defined(__SSE3__)
956 if (flush_denormals == 0)
957 {
958 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
959 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
960 }
961 else if (flush_denormals == 1)
962 {
963 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
964 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
965 }
966 else if (flush_denormals == 2)
967 {
968 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
969 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
970 }
971 else if (flush_denormals == 3)
972 {
973 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
974 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
975 }
976
977 tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
978 return 0;
979 #else
980 return 0;
981 #endif
982 }
983
984 } // namespace ncnn
985