1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "hwy/targets.h"
16
17 #include <stdarg.h>
18 #include <stddef.h>
19 #include <stdint.h>
20 #include <stdio.h>
21
22 #include <atomic>
23
24 #include "hwy/base.h"
25
26 #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
27 #include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
28 #endif
29
30 #include <stdlib.h> // abort / exit
31
32 #if HWY_ARCH_X86
33 #include <xmmintrin.h>
34 #if HWY_COMPILER_MSVC
35 #include <intrin.h>
36 #else // !HWY_COMPILER_MSVC
37 #include <cpuid.h>
38 #endif // HWY_COMPILER_MSVC
39 #endif // HWY_ARCH_X86
40
41 namespace hwy {
42 namespace {
43
44 #if HWY_ARCH_X86
45
IsBitSet(const uint32_t reg,const int index)46 HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) {
47 return (reg & (1U << index)) != 0;
48 }
49
50 // Calls CPUID instruction with eax=level and ecx=count and returns the result
51 // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
Cpuid(const uint32_t level,const uint32_t count,uint32_t * HWY_RESTRICT abcd)52 HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
53 uint32_t* HWY_RESTRICT abcd) {
54 #if HWY_COMPILER_MSVC
55 int regs[4];
56 __cpuidex(regs, level, count);
57 for (int i = 0; i < 4; ++i) {
58 abcd[i] = regs[i];
59 }
60 #else // HWY_COMPILER_MSVC
61 uint32_t a;
62 uint32_t b;
63 uint32_t c;
64 uint32_t d;
65 __cpuid_count(level, count, a, b, c, d);
66 abcd[0] = a;
67 abcd[1] = b;
68 abcd[2] = c;
69 abcd[3] = d;
70 #endif // HWY_COMPILER_MSVC
71 }
72
// Reads extended control register 0 (XCR0) and returns its lower 32 bits.
// Requires CPU support for "OSXSAVE" (see below).
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  return static_cast<uint32_t>(_xgetbv(0));
#else   // HWY_COMPILER_MSVC
  // Register index is passed in ecx; the result is returned in edx:eax.
  const uint32_t xcr_index = 0;
  uint32_t low32;
  uint32_t high32;  // Written by the instruction, hence declared as an output.
  // Raw opcode bytes of XGETBV (the non-MSVC counterpart of _xgetbv above).
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(low32), "=d"(high32)
               : "c"(xcr_index));
  return low32;
#endif  // HWY_COMPILER_MSVC
}
87
88 #endif // HWY_ARCH_X86
89
90 // Not function-local => no compiler-generated locking.
91 std::atomic<uint32_t> supported_{0}; // Not yet initialized
92
93 // When running tests, this value can be set to the mocked supported targets
94 // mask. Only written to from a single thread before the test starts.
95 uint32_t supported_targets_for_test_ = 0;
96
97 // Mask of targets disabled at runtime with DisableTargets.
98 uint32_t supported_mask_{LimitsMax<uint32_t>()};
99
100 #if HWY_ARCH_X86
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Using an enum ensures the values are distinct; only the first
// enumerator is given an explicit value and the rest count upwards from it.
enum class FeatureIndex : uint32_t {
  // 128-bit SSE family up to SSSE3.
  kSSE = 0,
  kSSE2,
  kSSE3,
  kSSSE3,

  // SSE4.x plus carry-less multiply and AES.
  kSSE41,
  kSSE42,
  kCLMUL,
  kAES,

  // AVX/AVX2 and the scalar/bit-manipulation extensions grouped with them.
  kAVX,
  kAVX2,
  kF16C,
  kFMA,
  kLZCNT,
  kBMI,
  kBMI2,

  // AVX-512 foundation and its basic subsets.
  kAVX512F,
  kAVX512VL,
  kAVX512DQ,
  kAVX512BW,

  // Additional AVX-512 era extensions.
  kVNNI,
  kVPCLMULQDQ,
  kVBMI2,
  kVAES,
  kPOPCNTDQ,
  kBITALG,

  // Number of features; must remain the last enumerator.
  kSentinel
};
// Each index is used as a bit position in a 64-bit mask.
static_assert(static_cast<uint32_t>(FeatureIndex::kSentinel) < 64,
              "Too many bits for u64");
138
Bit(FeatureIndex index)139 HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
140 return 1ull << static_cast<size_t>(index);
141 }
142
143 constexpr uint64_t kGroupSSSE3 =
144 Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2) |
145 Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3);
146
147 constexpr uint64_t kGroupSSE4 =
148 Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
149 Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
150
151 // We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
152 // use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
153 // [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
154 // avoiding using and requiring these so AVX2 can still be used.
155 #ifdef HWY_DISABLE_BMI2_FMA
156 constexpr uint64_t kGroupBMI2_FMA = 0;
157 #else
158 constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
159 Bit(FeatureIndex::kBMI2) |
160 Bit(FeatureIndex::kFMA);
161 #endif
162
163 #ifdef HWY_DISABLE_F16C
164 constexpr uint64_t kGroupF16C = 0;
165 #else
166 constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
167 #endif
168
169 constexpr uint64_t kGroupAVX2 =
170 Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
171 Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;
172
173 constexpr uint64_t kGroupAVX3 =
174 Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
175 Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | kGroupAVX2;
176
177 constexpr uint64_t kGroupAVX3_DL =
178 Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
179 Bit(FeatureIndex::kVBMI2) | Bit(FeatureIndex::kVAES) |
180 Bit(FeatureIndex::kPOPCNTDQ) | Bit(FeatureIndex::kBITALG) | kGroupAVX3;
181
182 #endif // HWY_ARCH_X86
183
184 } // namespace
185
186 HWY_NORETURN void HWY_FORMAT(3, 4)
Abort(const char * file,int line,const char * format,...)187 Abort(const char* file, int line, const char* format, ...) {
188 char buf[2000];
189 va_list args;
190 va_start(args, format);
191 vsnprintf(buf, sizeof(buf), format, args);
192 va_end(args);
193
194 fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
195
196 // If compiled with any sanitizer, they can also print a stack trace.
197 #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
198 __sanitizer_print_stack_trace();
199 #endif // HWY_IS_*
200 fflush(stderr);
201
202 // Now terminate the program:
203 #if HWY_ARCH_RVV
204 exit(1); // trap/abort just freeze Spike.
205 #elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
206 // Facilitates breaking into a debugger, but don't use this in non-debug
207 // builds because it looks like "illegal instruction", which is misleading.
208 __builtin_trap();
209 #else
210 abort(); // Compile error without this due to HWY_NORETURN.
211 #endif
212 }
213
DisableTargets(uint32_t disabled_targets)214 void DisableTargets(uint32_t disabled_targets) {
215 supported_mask_ = ~(disabled_targets & ~uint32_t(HWY_ENABLED_BASELINE));
216 // We can call Update() here to initialize the mask but that will trigger a
217 // call to SupportedTargets() which we use in tests to tell whether any of the
218 // highway dynamic dispatch functions were used.
219 GetChosenTarget().DeInit();
220 }
221
SetSupportedTargetsForTest(uint32_t targets)222 void SetSupportedTargetsForTest(uint32_t targets) {
223 // Reset the cached supported_ value to 0 to force a re-evaluation in the
224 // next call to SupportedTargets() which will use the mocked value set here
225 // if not zero.
226 supported_.store(0, std::memory_order_release);
227 supported_targets_for_test_ = targets;
228 GetChosenTarget().DeInit();
229 }
230
SupportedTargetsCalledForTest()231 bool SupportedTargetsCalledForTest() {
232 return supported_.load(std::memory_order_acquire) != 0;
233 }
234
SupportedTargets()235 uint32_t SupportedTargets() {
236 uint32_t bits = supported_.load(std::memory_order_acquire);
237 // Already initialized?
238 if (HWY_LIKELY(bits != 0)) {
239 return bits & supported_mask_;
240 }
241
242 // When running tests, this allows to mock the current supported targets.
243 if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
244 // Store the value to signal that this was used.
245 supported_.store(supported_targets_for_test_, std::memory_order_release);
246 return supported_targets_for_test_ & supported_mask_;
247 }
248
249 bits = HWY_SCALAR;
250
251 #if HWY_ARCH_X86
252 bool has_osxsave = false;
253 { // ensures we do not accidentally use flags outside this block
254 uint64_t flags = 0;
255 uint32_t abcd[4];
256
257 Cpuid(0, 0, abcd);
258 const uint32_t max_level = abcd[0];
259
260 // Standard feature flags
261 Cpuid(1, 0, abcd);
262 flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
263 flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
264 flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
265 flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
266 flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
267 flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
268 flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
269 flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
270 flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
271 flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
272 flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;
273 has_osxsave = IsBitSet(abcd[2], 27);
274
275 // Extended feature flags
276 Cpuid(0x80000001U, 0, abcd);
277 flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;
278
279 // Extended features
280 if (max_level >= 7) {
281 Cpuid(7, 0, abcd);
282 flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
283 flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
284 flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;
285
286 flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
287 flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
288 flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
289 flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;
290
291 flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
292 flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
293 flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
294 flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
295 flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
296 flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;
297 }
298
299 // Set target bit(s) if all their group's flags are all set.
300 if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
301 bits |= HWY_AVX3_DL;
302 }
303 if ((flags & kGroupAVX3) == kGroupAVX3) {
304 bits |= HWY_AVX3;
305 }
306 if ((flags & kGroupAVX2) == kGroupAVX2) {
307 bits |= HWY_AVX2;
308 }
309 if ((flags & kGroupSSE4) == kGroupSSE4) {
310 bits |= HWY_SSE4;
311 }
312 if ((flags & kGroupSSSE3) == kGroupSSSE3) {
313 bits |= HWY_SSSE3;
314 }
315 }
316
317 // Clear bits if the OS does not support XSAVE - otherwise, registers
318 // are not preserved across context switches.
319 if (has_osxsave) {
320 const uint32_t xcr0 = ReadXCR0();
321 // XMM
322 if (!IsBitSet(xcr0, 1)) {
323 bits &=
324 ~uint32_t(HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
325 }
326 // YMM
327 if (!IsBitSet(xcr0, 2)) {
328 bits &= ~uint32_t(HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
329 }
330 // ZMM + opmask
331 if ((xcr0 & 0x70) != 0x70) {
332 bits &= ~uint32_t(HWY_AVX3 | HWY_AVX3_DL);
333 }
334 }
335
336 #else
337 // TODO(janwas): detect for other platforms
338 bits = HWY_ENABLED_BASELINE;
339 #endif // HWY_ARCH_X86
340
341 if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
342 fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
343 size_t(bits), HWY_ENABLED_BASELINE);
344 }
345
346 supported_.store(bits, std::memory_order_release);
347 return bits & supported_mask_;
348 }
349
GetChosenTarget()350 HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
351 static ChosenTarget chosen_target;
352 return chosen_target;
353 }
354
Update()355 void ChosenTarget::Update() {
356 // The supported variable contains the current CPU supported targets shifted
357 // to the location expected by the ChosenTarget mask. We enabled SCALAR
358 // regardless of whether it was compiled since it is also used as the
359 // fallback mechanism to the baseline target.
360 uint32_t supported = HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
361 HWY_CHOSEN_TARGET_MASK_SCALAR;
362 mask_.store(supported);
363 }
364
365 } // namespace hwy
366