1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4 
5 #include "blake3_impl.h"
6 
7 #if defined(IS_X86)
8 #if defined(_MSC_VER)
9 #include <intrin.h>
10 #elif defined(__GNUC__)
11 #include <immintrin.h>
12 #else
13 #error "Unimplemented!"
14 #endif
15 #endif
16 
17 #if defined(IS_X86)
/*
 * Read an extended control register with the XGETBV instruction and return
 * its 64-bit value.
 *
 * Register index 0 is requested (XCR0 per the Intel SDM); the caller in this
 * file tests its bits to see which SIMD register states are enabled.
 * The caller checks the OSXSAVE CPUID bit before calling this function.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0, hi = 0;
  /* ECX = 0 selects the register; the result comes back split in EDX:EAX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  return ((uint64_t)hi << 32) | lo;
#endif
}
27 
/*
 * Execute CPUID for leaf `id` and store the resulting EAX, EBX, ECX and EDX
 * values in out[0], out[1], out[2] and out[3] respectively.
 *
 * No sub-leaf value is passed in ECX by the inline-asm paths; use cpuidex()
 * for leaves that need one.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  /* 32-bit x86: EBX is not listed as an output. Its incoming value is parked
   * in a compiler-chosen register (%1) before CPUID, then exchanged back
   * afterwards, so EBX is restored and %1 ends up holding CPUID's EBX result.
   * NOTE(review): presumably because EBX can be reserved (e.g. as the PIC
   * base register) on this target -- confirm. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#else
  /* Other targets: all four result registers are bound directly. */
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#endif
}
43 
/*
 * Execute CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and store
 * the resulting EAX, EBX, ECX and EDX values in out[0], out[1], out[2] and
 * out[3] respectively.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  /* 32-bit x86: EBX is not listed as an output. Its incoming value is parked
   * in a compiler-chosen register (%1) before CPUID, then exchanged back
   * afterwards, so EBX is restored and %1 ends up holding CPUID's EBX result.
   * NOTE(review): presumably because EBX can be reserved (e.g. as the PIC
   * base register) on this target -- confirm. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  /* Other targets: all four result registers are bound directly. */
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}
59 
60 #endif
61 
/*
 * SIMD capability flags. Each enumerator is a distinct single-bit mask, so
 * several of them can be OR-ed together into one `enum cpu_feature` value.
 */
enum cpu_feature {
  SSE2 = 0x00000001,     /* bit 0 */
  SSSE3 = 0x00000002,    /* bit 1 */
  SSE41 = 0x00000004,    /* bit 2 */
  AVX = 0x00000008,      /* bit 3 */
  AVX2 = 0x00000010,     /* bit 4 */
  AVX512F = 0x00000020,  /* bit 5 */
  AVX512VL = 0x00000040, /* bit 6 */
  /* ... */
  /* Sentinel (bit 30): feature detection has not been performed yet. */
  UNDEFINED = 0x40000000
};
73 
74 #if !defined(BLAKE3_TESTING)
75 static /* Allow the variable to be controlled manually for testing */
76 #endif
77     enum cpu_feature g_cpu_features = UNDEFINED;
78 
79 #if !defined(BLAKE3_TESTING)
80 static
81 #endif
82     enum cpu_feature
get_cpu_features()83     get_cpu_features() {
84 
85   if (g_cpu_features != UNDEFINED) {
86     return g_cpu_features;
87   } else {
88 #if defined(IS_X86)
89     uint32_t regs[4] = {0};
90     uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
91     (void)edx;
92     enum cpu_feature features = 0;
93     cpuid(regs, 0);
94     const int max_id = *eax;
95     cpuid(regs, 1);
96 #if defined(__amd64__) || defined(_M_X64)
97     features |= SSE2;
98 #else
99     if (*edx & (1UL << 26))
100       features |= SSE2;
101 #endif
102     if (*ecx & (1UL << 0))
103       features |= SSSE3;
104     if (*ecx & (1UL << 19))
105       features |= SSE41;
106 
107     if (*ecx & (1UL << 27)) { // OSXSAVE
108       const uint64_t mask = xgetbv();
109       if ((mask & 6) == 6) { // SSE and AVX states
110         if (*ecx & (1UL << 28))
111           features |= AVX;
112         if (max_id >= 7) {
113           cpuidex(regs, 7, 0);
114           if (*ebx & (1UL << 5))
115             features |= AVX2;
116           if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
117             if (*ebx & (1UL << 31))
118               features |= AVX512VL;
119             if (*ebx & (1UL << 16))
120               features |= AVX512F;
121           }
122         }
123       }
124     }
125     g_cpu_features = features;
126     return features;
127 #else
128     /* How to detect NEON? */
129     return 0;
130 #endif
131   }
132 }
133 
/*
 * Runtime dispatcher for the in-place compression function.
 *
 * On x86 builds the detected CPU features are consulted and the call is
 * forwarded, with all arguments unchanged, to the first backend that is both
 * compiled in (not disabled via BLAKE3_NO_*) and supported:
 * AVX-512, then SSE4.1, then SSE2. Otherwise the portable implementation
 * is used.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();

#if !defined(BLAKE3_NO_AVX512)
  /* Only the AVX512VL flag is tested for this entry point. */
  if ((detected & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif /* IS_X86 */

  /* No SIMD backend selected. */
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
161 
/*
 * Runtime dispatcher for the XOF compression function, which writes its
 * result to `out`.
 *
 * On x86 builds the detected CPU features are consulted and the call is
 * forwarded, with all arguments unchanged, to the first backend that is both
 * compiled in (not disabled via BLAKE3_NO_*) and supported:
 * AVX-512, then SSE4.1, then SSE2. Otherwise the portable implementation
 * is used.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();

#if !defined(BLAKE3_NO_AVX512)
  /* Only the AVX512VL flag is tested for this entry point. */
  if ((detected & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif /* IS_X86 */

  /* No SIMD backend selected. */
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
189 
/*
 * Runtime dispatcher for the multi-input hashing routine.
 *
 * On x86 builds the detected CPU features are consulted and the call is
 * forwarded, with all arguments unchanged, to the first backend that is both
 * compiled in (not disabled via BLAKE3_NO_*) and supported:
 * AVX-512 (requires both AVX512F and AVX512VL), then AVX2, then SSE4.1,
 * then SSE2. If none applies, the NEON backend is used when BLAKE3_USE_NEON
 * is defined, and the portable implementation otherwise.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();

#if !defined(BLAKE3_NO_AVX512)
  if ((detected & AVX512F) != 0 && (detected & AVX512VL) != 0) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((detected & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif /* IS_X86 */

  /* No x86 backend was selected. */
#if defined(BLAKE3_USE_NEON)
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
240 
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 when both AVX512F and AVX512VL are available, 8 for AVX2, and 4
// for SSE4.1 or SSE2 (each only if the matching backend is not disabled via
// BLAKE3_NO_*). Without a usable x86 backend the result is 4 when
// BLAKE3_USE_NEON is defined and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();

#if !defined(BLAKE3_NO_AVX512)
  if ((detected & AVX512F) != 0 && (detected & AVX512VL) != 0) {
    return 16;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((detected & AVX2) != 0) {
    return 8;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    return 4;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    return 4;
  }
#endif
#endif /* IS_X86 */

#if defined(BLAKE3_USE_NEON)
  return 4;
#else
  return 1;
#endif
}
271