1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4 #include <stdio.h>
5 
6 #include "blake3_impl.h"
7 
8 #if defined(IS_X86)
9 #if defined(_MSC_VER)
10 #include <intrin.h>
11 #elif defined(__GNUC__)
12 #include <immintrin.h>
13 #else
14 #error "Unimplemented!"
15 #endif
16 #endif
17 
18 #if defined(IS_X86)
/*
 * Read extended control register 0 via the XGETBV instruction and return
 * it as a single 64-bit value.
 *
 * The only caller in this file invokes it after confirming the OSXSAVE
 * bit from CPUID leaf 1; keep that ordering when adding new call sites.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0, hi = 0;
  /* ECX = 0 selects the register; the result comes back split as EDX:EAX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  return ((uint64_t)hi << 32) | lo;
#endif
}
28 
/*
 * Execute CPUID with EAX = id and store the resulting EAX, EBX, ECX and EDX
 * values in out[0], out[1], out[2] and out[3] respectively.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  uint32_t reg_a, reg_b, reg_c, reg_d;
  /*
   * EBX is parked in a scratch register around the instruction and swapped
   * back afterwards, so it is never listed as an asm output here
   * (presumably because EBX can be reserved on 32-bit builds, e.g. for PIC).
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#else
  uint32_t reg_a, reg_b, reg_c, reg_d;
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
44 
/*
 * Execute CPUID with EAX = id and ECX = sid (the sub-leaf) and store the
 * resulting EAX, EBX, ECX and EDX values in out[0..3], in that order.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  uint32_t reg_a, reg_b, reg_c, reg_d;
  /*
   * Same EBX save/restore dance as the plain cpuid() helper: EBX is moved to
   * a scratch register before the instruction and exchanged back after it.
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#else
  uint32_t reg_a, reg_b, reg_c, reg_d;
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
60 
61 #endif
62 
/*
 * Bit flags describing the SIMD instruction-set extensions the dispatcher
 * distinguishes. Individual flags are OR-ed together into one mask.
 */
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000 /* placeholder stored before any detection ran */
};

/*
 * Cached feature mask. It starts out as UNDEFINED. The variable has internal
 * linkage unless BLAKE3_TESTING is defined, in which case it is exported so
 * that it can be controlled manually for testing.
 */
#if defined(BLAKE3_TESTING)
enum cpu_feature g_cpu_features = UNDEFINED;
#else
static enum cpu_feature g_cpu_features = UNDEFINED;
#endif
79 
80 #if !defined(BLAKE3_TESTING)
81 static
82 #endif
83     enum cpu_feature
get_cpu_features()84     get_cpu_features() {
85 
86   if (g_cpu_features != UNDEFINED) {
87     return g_cpu_features;
88   } else {
89 #if defined(IS_X86)
90     uint32_t regs[4] = {0};
91     uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
92     (void)edx;
93     enum cpu_feature features = 0;
94     cpuid(regs, 0);
95     const int max_id = *eax;
96     cpuid(regs, 1);
97 #if defined(__amd64__) || defined(_M_X64)
98     features |= SSE2;
99 #else
100     if (*edx & (1UL << 26))
101       features |= SSE2;
102 #endif
103     if (*ecx & (1UL << 0))
104       features |= SSSE3;
105     if (*ecx & (1UL << 19))
106       features |= SSE41;
107 
108     if (*ecx & (1UL << 27)) { // OSXSAVE
109       const uint64_t mask = xgetbv();
110       if ((mask & 6) == 6) { // SSE and AVX states
111         if (*ecx & (1UL << 28))
112           features |= AVX;
113         if (max_id >= 7) {
114           cpuidex(regs, 7, 0);
115           if (*ebx & (1UL << 5))
116             features |= AVX2;
117           if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
118             if (*ebx & (1UL << 31))
119               features |= AVX512VL;
120             if (*ebx & (1UL << 16))
121               features |= AVX512F;
122           }
123         }
124       }
125     }
126     g_cpu_features = features;
127     return features;
128 #else
129     /* How to detect NEON? */
130     return 0;
131 #endif
132   }
133 }
134 
/*
 * Runtime dispatcher for the in-place compression function.
 *
 * All arguments are forwarded unchanged to the first implementation that is
 * both compiled in and reported by get_cpu_features(): AVX-512, then SSE4.1,
 * and finally the portable fallback.
 *
 * NOTE(review): the AVX-512 path is gated on AVX512VL alone, while
 * blake3_hash_many() requires AVX512F and AVX512VL together - confirm that
 * the difference is intentional.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  (void)features; /* unused when every x86 backend is compiled out */
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
156 
/*
 * Runtime dispatcher for the extended-output compression function.
 *
 * All arguments, including the 64-byte `out` buffer, are forwarded unchanged
 * to the first implementation that is both compiled in and reported by
 * get_cpu_features(): AVX-512, then SSE4.1, and finally the portable
 * fallback.
 *
 * NOTE(review): the AVX-512 path is gated on AVX512VL alone, while
 * blake3_hash_many() requires AVX512F and AVX512VL together - confirm that
 * the difference is intentional.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  (void)features; /* unused when every x86 backend is compiled out */
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
178 
/*
 * Runtime dispatcher for the multi-input hashing routine.
 *
 * All arguments are forwarded unchanged to exactly one implementation. On
 * x86 the order of preference is AVX-512 (needs both AVX512F and AVX512VL),
 * AVX2, then SSE4.1, each subject to its BLAKE3_NO_* compile-time switch and
 * to the mask returned by get_cpu_features(). If none of those is taken, the
 * NEON implementation is used when BLAKE3_USE_NEON is defined, otherwise the
 * portable one.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  (void)features; /* unused when every x86 backend is compiled out */
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#endif

  /* Exactly one of the two fallbacks is compiled, so no code is dead. */
#if defined(BLAKE3_USE_NEON)
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
221 
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 when both AVX512F and AVX512VL are reported, 8 for AVX2 and 4
// for SSE4.1 (each only if the matching backend is compiled in). Without any
// of those, the result is 4 when BLAKE3_USE_NEON is defined and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  (void)features; /* unused when every x86 backend is compiled out */
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    return 4;
  }
#endif
#endif
  /* Exactly one fallback return is compiled, so no code is dead. */
#if defined(BLAKE3_USE_NEON)
  return 4;
#else
  return 1;
#endif
}
247