1 /*******************************************************************************
2 * Copyright 2020-2021 Intel Corporation
3 * Copyright 2020 FUJITSU LIMITED
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *******************************************************************************/
17
18 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
19 #include <algorithm>
20
21 #if defined(_WIN32)
22 #include <windows.h>
23 #elif defined(__GLIBC__)
24 #include <sched.h>
25 #endif
26 #endif
27
28 #include "cpu/platform.hpp"
29
30 #if DNNL_X64
31 #include "cpu/x64/cpu_isa_traits.hpp"
32 #elif DNNL_AARCH64
33 #include "cpu/aarch64/cpu_isa_traits.hpp"
34 #endif
35
36 // For DNNL_X64 build we compute the timestamp using rdtsc. Use std::chrono for
37 // other builds.
38 #if !DNNL_X64
39 #include <chrono>
40 #endif
41
42 namespace dnnl {
43 namespace impl {
44 namespace cpu {
45 namespace platform {
46
// Returns the ISA info string for the current build: the x64 or aarch64
// backend supplies it when built for those targets, otherwise the literal
// "Generic" is used.
const char *get_isa_info() {
#if DNNL_X64
    const char *isa_info = x64::get_isa_info();
#elif DNNL_AARCH64
    const char *isa_info = aarch64::get_isa_info();
#else
    const char *isa_info = "Generic";
#endif
    return isa_info;
}
56
get_effective_cpu_isa()57 dnnl_cpu_isa_t get_effective_cpu_isa() {
58 #if DNNL_X64
59 return x64::get_effective_cpu_isa();
60 #elif DNNL_AARCH64
61 return aarch64::get_effective_cpu_isa();
62 #else
63 return dnnl_cpu_isa_all;
64 #endif
65 }
66
set_max_cpu_isa(dnnl_cpu_isa_t isa)67 status_t set_max_cpu_isa(dnnl_cpu_isa_t isa) {
68 #if DNNL_X64
69 return x64::set_max_cpu_isa(isa);
70 #else
71 return status::unimplemented;
72 #endif
73 }
74
set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints)75 status_t set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints) {
76 #if DNNL_X64
77 return x64::set_cpu_isa_hints(isa_hints);
78 #else
79 return status::unimplemented;
80 #endif
81 }
82
get_cpu_isa_hints()83 dnnl_cpu_isa_hints_t get_cpu_isa_hints() {
84 #if DNNL_X64
85 return x64::get_cpu_isa_hints();
86 #else
87 return dnnl_cpu_isa_no_hints;
88 #endif
89 }
90
// Tells whether the x64 ISA hints are set to dnnl_cpu_isa_prefer_ymm.
// Always false on non-x64 builds.
bool prefer_ymm_requested() {
    bool ymm_preferred = false;
#if DNNL_X64
    ymm_preferred = (x64::get_cpu_isa_hints() == dnnl_cpu_isa_prefer_ymm);
#endif
    return ymm_preferred;
}
99
has_data_type_support(data_type_t data_type)100 bool has_data_type_support(data_type_t data_type) {
101 switch (data_type) {
102 case data_type::bf16:
103 #if DNNL_X64
104 return x64::mayiuse(x64::avx512_core);
105 #else
106 return false;
107 #endif
108 case data_type::f16: return false;
109 default: return true;
110 }
111 }
112
// Returns the scale factor applied to s8s8 weights: 0.5f on x64 builds where
// mayiuse(avx512_core_vnni) is false, and 1.0f in every other case
// (including all non-x64 builds).
float s8s8_weights_scale_factor() {
#if DNNL_X64
    if (!x64::mayiuse(x64::avx512_core_vnni)) return 0.5f;
#endif
    return 1.0f;
}
120
// Returns the per-core data cache size in bytes for the given cache level
// (1-based). On x64 the size reported for that level is divided by the number
// of cores sharing it; levels outside the reported range yield 0. When no
// cache information is available (x64 reporting zero levels, or any non-x64
// build) fixed default sizes are used instead.
unsigned get_per_core_cache_size(int level) {
    // Default sizes for L1 / L2 / L3; any other level maps to 0.
    const auto default_size = [](int cache_level) -> unsigned {
        if (cache_level == 1) return 32U * 1024;
        if (cache_level == 2) return 512U * 1024;
        if (cache_level == 3) return 1024U * 1024;
        return 0U;
    };

#if DNNL_X64
    const auto known_levels = x64::cpu().getDataCacheLevels();
    if (known_levels == 0) return default_size(level);

    const bool level_is_reported
            = level > 0 && static_cast<unsigned>(level) <= known_levels;
    if (!level_is_reported) return 0;

    // The query interface is indexed from 0, the public argument from 1.
    const unsigned idx = level - 1;
    return x64::cpu().getDataCacheSize(idx)
            / x64::cpu().getCoresSharingDataCache(idx);
#else
    return default_size(level);
#endif
}
144
// Returns the number of cores reported by the x64 CPU query at
// Xbyak::util::CoreLevel. Non-x64 builds have no such query and report 1.
unsigned get_num_cores() {
#if !DNNL_X64
    return 1;
#else
    return x64::cpu().getNumCores(Xbyak::util::CoreLevel);
#endif
}
152
153 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
// The purpose of this function is to return the potential maximum number of
// threads in user's threadpool. It is assumed that the number of threads in an
// actual threadpool will not exceed the number of cores in a socket reported
// by the OS, which may or may not be equal to the number of total physical
// cores in a socket depending on the OS configuration (read -- VM environment).
// In order to simulate the number of cores available in such environment, this
// function supports process affinity.
get_max_threads_to_use()161 unsigned get_max_threads_to_use() {
162 int num_cores_per_socket = (int)dnnl::impl::cpu::platform::get_num_cores();
163 #if defined(_WIN32)
164 DWORD_PTR proc_affinity_mask;
165 DWORD_PTR sys_affinity_mask;
166 if (GetProcessAffinityMask(
167 GetCurrentProcess(), &proc_affinity_mask, &sys_affinity_mask)) {
168 int masked_nthr = 0;
169 for (int i = 0; i < CHAR_BIT * sizeof(proc_affinity_mask);
170 i++, proc_affinity_mask >>= 1)
171 masked_nthr += proc_affinity_mask & 1;
172 return std::min(masked_nthr, num_cores_per_socket);
173 }
174 #elif defined(__GLIBC__)
175 cpu_set_t cpu_set;
176 // Check if the affinity of the process has been set using, e.g.,
177 // numactl.
178 if (::sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set) == 0)
179 return std::min(CPU_COUNT(&cpu_set), num_cores_per_socket);
180 #endif
181 return num_cores_per_socket;
182 }
183 #endif
184
// Returns the vector length (cpu_isa_traits<isa>::vlen) of the widest vector
// ISA that mayiuse() reports as usable, or 0 when none of the probed ISAs is
// available or the build targets neither x64 nor AArch64.
//
// ISAs must be probed from widest to narrowest: the first match wins, so a
// narrower baseline ISA tested first would hide a wider one.
int get_vector_register_size() {
#if DNNL_X64
    using namespace x64;
    if (mayiuse(avx512_common)) return cpu_isa_traits<avx512_common>::vlen;
    if (mayiuse(avx)) return cpu_isa_traits<avx>::vlen;
    if (mayiuse(sse41)) return cpu_isa_traits<sse41>::vlen;
#elif DNNL_AARCH64
    using namespace aarch64;
    // sve_512 has to be checked before asimd: SVE-capable cores are expected
    // to report asimd as well, so testing asimd first would always return
    // the asimd length and never the sve_512 one.
    if (mayiuse(sve_512)) return cpu_isa_traits<sve_512>::vlen;
    if (mayiuse(asimd)) return cpu_isa_traits<asimd>::vlen;
#endif
    return 0;
}
198
199 /* The purpose of this function is to provide a very efficient timestamp
200 * calculation (used primarily for primitive cache). For DNNL_X64, this can be
201 * accomplished using *rdtsc* since it provides a timestamp value that (i) is
202 * independent for each core, and (ii) is synchronized across cores in multiple
203 * sockets.
204 * TODO: For now, use std::chrono::steady_clock for other builds, however
205 * another more optimized function may be called here.
206 */
size_t get_timestamp() {
#if DNNL_X64
    // x64: raw time-stamp counter read through Xbyak.
    const auto ticks = Xbyak::util::Clock::getRdtsc();
#else
    // Other targets: tick count of the steady clock since its epoch.
    const auto ticks
            = std::chrono::steady_clock::now().time_since_epoch().count();
#endif
    return static_cast<size_t>(ticks);
}
215
216 } // namespace platform
217 } // namespace cpu
218 } // namespace impl
219 } // namespace dnnl
220