1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "hwy/targets.h"
16 
17 #include <stdarg.h>
18 #include <stddef.h>
19 #include <stdint.h>
20 #include <stdio.h>
21 
22 #include <atomic>
23 
24 #include "hwy/base.h"
25 
26 #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
27 #include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
28 #endif
29 
30 #include <stdlib.h>  // abort / exit
31 
32 #if HWY_ARCH_X86
33 #include <xmmintrin.h>
34 #if HWY_COMPILER_MSVC
35 #include <intrin.h>
36 #else  // !HWY_COMPILER_MSVC
37 #include <cpuid.h>
38 #endif  // HWY_COMPILER_MSVC
39 #endif  // HWY_ARCH_X86
40 
41 namespace hwy {
42 namespace {
43 
44 #if HWY_ARCH_X86
45 
IsBitSet(const uint32_t reg,const int index)46 HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) {
47   return (reg & (1U << index)) != 0;
48 }
49 
50 // Calls CPUID instruction with eax=level and ecx=count and returns the result
51 // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
Cpuid(const uint32_t level,const uint32_t count,uint32_t * HWY_RESTRICT abcd)52 HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
53                       uint32_t* HWY_RESTRICT abcd) {
54 #if HWY_COMPILER_MSVC
55   int regs[4];
56   __cpuidex(regs, level, count);
57   for (int i = 0; i < 4; ++i) {
58     abcd[i] = regs[i];
59   }
60 #else  // HWY_COMPILER_MSVC
61   uint32_t a;
62   uint32_t b;
63   uint32_t c;
64   uint32_t d;
65   __cpuid_count(level, count, a, b, c, d);
66   abcd[0] = a;
67   abcd[1] = b;
68   abcd[2] = c;
69   abcd[3] = d;
70 #endif  // HWY_COMPILER_MSVC
71 }
72 
// Returns the lower 32 bits of extended control register 0 (XCR0).
// Requires CPU support for "OSXSAVE" (see below); callers must check that
// flag before calling this function.
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  const auto xcr0_full = _xgetbv(0);
  return static_cast<uint32_t>(xcr0_full);
#else   // HWY_COMPILER_MSVC
  constexpr uint32_t kXcrIndex = 0;  // passed in ecx: selects XCR0
  uint32_t low_bits;
  uint32_t high_bits;  // written by the instruction (edx) but not returned
  // The instruction is emitted as raw opcode bytes rather than a mnemonic.
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(low_bits), "=d"(high_bits)
               : "c"(kXcrIndex));
  return low_bits;
#endif  // HWY_COMPILER_MSVC
}
87 
88 #endif  // HWY_ARCH_X86
89 
90 // Not function-local => no compiler-generated locking.
91 std::atomic<uint32_t> supported_{0};  // Not yet initialized
92 
93 // When running tests, this value can be set to the mocked supported targets
94 // mask. Only written to from a single thread before the test starts.
95 uint32_t supported_targets_for_test_ = 0;
96 
97 // Mask of targets disabled at runtime with DisableTargets.
98 uint32_t supported_mask_{LimitsMax<uint32_t>()};
99 
100 #if HWY_ARCH_X86
// Arbitrary bit indices indicating which instruction set extensions are
// supported. An enum (with implicit consecutive values) is used so that the
// indices are guaranteed to be distinct.
enum class FeatureIndex : uint32_t {
  // Required for the SSSE3 target.
  kSSE = 0, kSSE2, kSSE3, kSSSE3,

  // Additionally required for the SSE4 target.
  kSSE41, kSSE42, kCLMUL, kAES,

  // Additionally required for the AVX2 target (some may be compiled out).
  kAVX, kAVX2, kF16C, kFMA, kLZCNT, kBMI, kBMI2,

  // Additionally required for the AVX3 target.
  kAVX512F, kAVX512VL, kAVX512DQ, kAVX512BW,

  // Additionally required for the AVX3_DL target.
  kVNNI, kVPCLMULQDQ, kVBMI2, kVAES, kPOPCNTDQ, kBITALG,

  kSentinel  // Number of features; must remain last.
};
// Every index is used as a shift count into a 64-bit mask.
static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < size_t{64},
              "Too many bits for u64");
138 
Bit(FeatureIndex index)139 HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
140   return 1ull << static_cast<size_t>(index);
141 }
142 
143 constexpr uint64_t kGroupSSSE3 =
144     Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2) |
145     Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3);
146 
147 constexpr uint64_t kGroupSSE4 =
148     Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
149     Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
150 
151 // We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
152 // use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
153 // [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
154 // avoiding using and requiring these so AVX2 can still be used.
155 #ifdef HWY_DISABLE_BMI2_FMA
156 constexpr uint64_t kGroupBMI2_FMA = 0;
157 #else
158 constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
159                                     Bit(FeatureIndex::kBMI2) |
160                                     Bit(FeatureIndex::kFMA);
161 #endif
162 
163 #ifdef HWY_DISABLE_F16C
164 constexpr uint64_t kGroupF16C = 0;
165 #else
166 constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
167 #endif
168 
169 constexpr uint64_t kGroupAVX2 =
170     Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
171     Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;
172 
173 constexpr uint64_t kGroupAVX3 =
174     Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
175     Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | kGroupAVX2;
176 
177 constexpr uint64_t kGroupAVX3_DL =
178     Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
179     Bit(FeatureIndex::kVBMI2) | Bit(FeatureIndex::kVAES) |
180     Bit(FeatureIndex::kPOPCNTDQ) | Bit(FeatureIndex::kBITALG) | kGroupAVX3;
181 
182 #endif  // HWY_ARCH_X86
183 
184 }  // namespace
185 
186 HWY_NORETURN void HWY_FORMAT(3, 4)
Abort(const char * file,int line,const char * format,...)187     Abort(const char* file, int line, const char* format, ...) {
188   char buf[2000];
189   va_list args;
190   va_start(args, format);
191   vsnprintf(buf, sizeof(buf), format, args);
192   va_end(args);
193 
194   fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
195 
196 // If compiled with any sanitizer, they can also print a stack trace.
197 #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
198   __sanitizer_print_stack_trace();
199 #endif  // HWY_IS_*
200   fflush(stderr);
201 
202 // Now terminate the program:
203 #if HWY_ARCH_RVV
204   exit(1);  // trap/abort just freeze Spike.
205 #elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
206   // Facilitates breaking into a debugger, but don't use this in non-debug
207   // builds because it looks like "illegal instruction", which is misleading.
208   __builtin_trap();
209 #else
210   abort();  // Compile error without this due to HWY_NORETURN.
211 #endif
212 }
213 
DisableTargets(uint32_t disabled_targets)214 void DisableTargets(uint32_t disabled_targets) {
215   supported_mask_ = ~(disabled_targets & ~uint32_t(HWY_ENABLED_BASELINE));
216   // We can call Update() here to initialize the mask but that will trigger a
217   // call to SupportedTargets() which we use in tests to tell whether any of the
218   // highway dynamic dispatch functions were used.
219   GetChosenTarget().DeInit();
220 }
221 
SetSupportedTargetsForTest(uint32_t targets)222 void SetSupportedTargetsForTest(uint32_t targets) {
223   // Reset the cached supported_ value to 0 to force a re-evaluation in the
224   // next call to SupportedTargets() which will use the mocked value set here
225   // if not zero.
226   supported_.store(0, std::memory_order_release);
227   supported_targets_for_test_ = targets;
228   GetChosenTarget().DeInit();
229 }
230 
SupportedTargetsCalledForTest()231 bool SupportedTargetsCalledForTest() {
232   return supported_.load(std::memory_order_acquire) != 0;
233 }
234 
SupportedTargets()235 uint32_t SupportedTargets() {
236   uint32_t bits = supported_.load(std::memory_order_acquire);
237   // Already initialized?
238   if (HWY_LIKELY(bits != 0)) {
239     return bits & supported_mask_;
240   }
241 
242   // When running tests, this allows to mock the current supported targets.
243   if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
244     // Store the value to signal that this was used.
245     supported_.store(supported_targets_for_test_, std::memory_order_release);
246     return supported_targets_for_test_ & supported_mask_;
247   }
248 
249   bits = HWY_SCALAR;
250 
251 #if HWY_ARCH_X86
252   bool has_osxsave = false;
253   {  // ensures we do not accidentally use flags outside this block
254     uint64_t flags = 0;
255     uint32_t abcd[4];
256 
257     Cpuid(0, 0, abcd);
258     const uint32_t max_level = abcd[0];
259 
260     // Standard feature flags
261     Cpuid(1, 0, abcd);
262     flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
263     flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
264     flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
265     flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
266     flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
267     flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
268     flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
269     flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
270     flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
271     flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
272     flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;
273     has_osxsave = IsBitSet(abcd[2], 27);
274 
275     // Extended feature flags
276     Cpuid(0x80000001U, 0, abcd);
277     flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;
278 
279     // Extended features
280     if (max_level >= 7) {
281       Cpuid(7, 0, abcd);
282       flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
283       flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
284       flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;
285 
286       flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
287       flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
288       flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
289       flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;
290 
291       flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
292       flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
293       flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
294       flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
295       flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
296       flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;
297     }
298 
299     // Set target bit(s) if all their group's flags are all set.
300     if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
301       bits |= HWY_AVX3_DL;
302     }
303     if ((flags & kGroupAVX3) == kGroupAVX3) {
304       bits |= HWY_AVX3;
305     }
306     if ((flags & kGroupAVX2) == kGroupAVX2) {
307       bits |= HWY_AVX2;
308     }
309     if ((flags & kGroupSSE4) == kGroupSSE4) {
310       bits |= HWY_SSE4;
311     }
312     if ((flags & kGroupSSSE3) == kGroupSSSE3) {
313       bits |= HWY_SSSE3;
314     }
315   }
316 
317   // Clear bits if the OS does not support XSAVE - otherwise, registers
318   // are not preserved across context switches.
319   if (has_osxsave) {
320     const uint32_t xcr0 = ReadXCR0();
321     // XMM
322     if (!IsBitSet(xcr0, 1)) {
323       bits &=
324           ~uint32_t(HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
325     }
326     // YMM
327     if (!IsBitSet(xcr0, 2)) {
328       bits &= ~uint32_t(HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
329     }
330     // ZMM + opmask
331     if ((xcr0 & 0x70) != 0x70) {
332       bits &= ~uint32_t(HWY_AVX3 | HWY_AVX3_DL);
333     }
334   }
335 
336 #else
337   // TODO(janwas): detect for other platforms
338   bits = HWY_ENABLED_BASELINE;
339 #endif  // HWY_ARCH_X86
340 
341   if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
342     fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
343             size_t(bits), HWY_ENABLED_BASELINE);
344   }
345 
346   supported_.store(bits, std::memory_order_release);
347   return bits & supported_mask_;
348 }
349 
GetChosenTarget()350 HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
351   static ChosenTarget chosen_target;
352   return chosen_target;
353 }
354 
Update()355 void ChosenTarget::Update() {
356   // The supported variable contains the current CPU supported targets shifted
357   // to the location expected by the ChosenTarget mask. We enabled SCALAR
358   // regardless of whether it was compiled since it is also used as the
359   // fallback mechanism to the baseline target.
360   uint32_t supported = HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
361                        HWY_CHOSEN_TARGET_MASK_SCALAR;
362   mask_.store(supported);
363 }
364 
365 }  // namespace hwy
366