1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4 
5 #include "blake3_impl.h"
6 
7 #if defined(IS_X86)
8 #if defined(_MSC_VER)
9 #include <intrin.h>
10 #elif defined(__GNUC__)
11 #include <immintrin.h>
12 #else
13 #error "Unimplemented!"
14 #endif
15 #endif
16 
/* Evaluate `x` and discard the result so the compiler does not warn about an
 * otherwise unused variable. */
#define MAYBE_UNUSED(x) ((void)(x))
18 
19 #if defined(IS_X86)
/* Read extended control register 0 (XGETBV with ECX = 0) and return it as a
 * single 64-bit value. */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  /* The instruction returns its result split across EDX:EAX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t upper = (uint64_t)hi << 32;
  return upper | lo;
#endif
}
29 
/* Execute CPUID for leaf `id` and store EAX, EBX, ECX, EDX into out[0..3],
 * in that order. */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t reg_a = 0;
  uint32_t reg_b = 0;
  uint32_t reg_c = 0;
  uint32_t reg_d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* EBX is saved into a scratch register before CPUID and swapped back
   * afterwards, so the EBX result ends up in the scratch register while EBX
   * itself is restored. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
#endif
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
45 
/* Execute CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and store
 * EAX, EBX, ECX, EDX into out[0..3], in that order. */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t reg_a = 0;
  uint32_t reg_b = 0;
  uint32_t reg_c = 0;
  uint32_t reg_d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* EBX is saved into a scratch register before CPUID and swapped back
   * afterwards, so the EBX result ends up in the scratch register while EBX
   * itself is restored. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
61 
62 #endif
63 
/* Bit flags describing which SIMD instruction sets may be used. The values
 * are OR-ed together into a single mask; UNDEFINED is the sentinel meaning
 * "not detected yet". */
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000
};
75 
76 #if !defined(BLAKE3_TESTING)
77 static /* Allow the variable to be controlled manually for testing */
78 #endif
79     enum cpu_feature g_cpu_features = UNDEFINED;
80 
81 LLVM_ATTRIBUTE_USED
82 #if !defined(BLAKE3_TESTING)
83 static
84 #endif
85     enum cpu_feature
get_cpu_features(void)86     get_cpu_features(void) {
87 
88   if (g_cpu_features != UNDEFINED) {
89     return g_cpu_features;
90   } else {
91 #if defined(IS_X86)
92     uint32_t regs[4] = {0};
93     uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
94     (void)edx;
95     enum cpu_feature features = 0;
96     cpuid(regs, 0);
97     const int max_id = *eax;
98     cpuid(regs, 1);
99 #if defined(__amd64__) || defined(_M_X64)
100     features |= SSE2;
101 #else
102     if (*edx & (1UL << 26))
103       features |= SSE2;
104 #endif
105     if (*ecx & (1UL << 0))
106       features |= SSSE3;
107     if (*ecx & (1UL << 19))
108       features |= SSE41;
109 
110     if (*ecx & (1UL << 27)) { // OSXSAVE
111       const uint64_t mask = xgetbv();
112       if ((mask & 6) == 6) { // SSE and AVX states
113         if (*ecx & (1UL << 28))
114           features |= AVX;
115         if (max_id >= 7) {
116           cpuidex(regs, 7, 0);
117           if (*ebx & (1UL << 5))
118             features |= AVX2;
119           if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
120             if (*ebx & (1UL << 31))
121               features |= AVX512VL;
122             if (*ebx & (1UL << 16))
123               features |= AVX512F;
124           }
125         }
126       }
127     }
128     g_cpu_features = features;
129     return features;
130 #else
131     /* How to detect NEON? */
132     return 0;
133 #endif
134   }
135 }
136 
/*
 * Run the in-place compression function using the best implementation
 * available.
 *
 * On x86 the choice is made from get_cpu_features(), in order of preference:
 * AVX-512 (when AVX512VL is reported), SSE4.1, SSE2. Each candidate can be
 * compiled out with the matching BLAKE3_NO_* macro. If none applies, or on
 * other architectures, the portable implementation is used. All arguments
 * are forwarded unchanged.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();
  MAYBE_UNUSED(detected);

#if !defined(BLAKE3_NO_AVX512)
  if ((detected & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif /* IS_X86 */

  /* No usable SIMD implementation: fall back to plain C. */
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
165 
/*
 * Run the XOF compression function, writing into `out`, using the best
 * implementation available.
 *
 * On x86 the choice is made from get_cpu_features(), in order of preference:
 * AVX-512 (when AVX512VL is reported), SSE4.1, SSE2. Each candidate can be
 * compiled out with the matching BLAKE3_NO_* macro. If none applies, or on
 * other architectures, the portable implementation is used. All arguments
 * are forwarded unchanged.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();
  MAYBE_UNUSED(detected);

#if !defined(BLAKE3_NO_AVX512)
  if ((detected & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif /* IS_X86 */

  /* No usable SIMD implementation: fall back to plain C. */
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
194 
/*
 * Hash several inputs using the widest implementation available.
 *
 * On x86 the choice is made from get_cpu_features(), in order of preference:
 * AVX-512 (requires both AVX512F and AVX512VL), AVX2, SSE4.1, SSE2. Each
 * candidate can be compiled out with the matching BLAKE3_NO_* macro. If no
 * x86 path is taken, the NEON implementation is used when BLAKE3_USE_NEON
 * is 1, otherwise the portable one. All arguments are forwarded unchanged.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();
  MAYBE_UNUSED(detected);

#if !defined(BLAKE3_NO_AVX512)
  const int avx512_needed = AVX512F | AVX512VL;
  if ((detected & avx512_needed) == avx512_needed) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((detected & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
246 
// The dynamically detected SIMD degree of the current platform.
//
// On x86 this is 16 when both AVX512F and AVX512VL are reported, 8 for AVX2,
// and 4 for SSE4.1 or SSE2 (each subject to its BLAKE3_NO_* macro). If no x86
// path applies the result is 4 when BLAKE3_USE_NEON is 1, and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();
  MAYBE_UNUSED(detected);

#if !defined(BLAKE3_NO_AVX512)
  const int avx512_needed = AVX512F | AVX512VL;
  if ((detected & avx512_needed) == avx512_needed) {
    return 16;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((detected & AVX2) != 0) {
    return 8;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    return 4;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    return 4;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
278