1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4
5 #include "blake3_impl.h"
6
7 #if defined(IS_X86)
8 #if defined(_MSC_VER)
9 #include <intrin.h>
10 #elif defined(__GNUC__)
11 #include <immintrin.h>
12 #else
13 #error "Unimplemented!"
14 #endif
15 #endif
16
17 #if defined(IS_X86)
/*
 * Executes XGETBV with ECX = 0 (extended control register 0) and returns the
 * 64-bit result, assembled from EDX (upper half) and EAX (lower half).
 *
 * The only caller in this file invokes it after confirming the OSXSAVE bit
 * from CPUID leaf 1.
 */
static uint64_t xgetbv() {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  uint64_t xcr = hi;
  xcr <<= 32;
  xcr |= lo;
  return xcr;
#endif
}
27
/*
 * Runs the CPUID instruction for leaf `id` and stores the resulting
 * EAX, EBX, ECX and EDX values in out[0], out[1], out[2] and out[3].
 *
 * The sub-leaf register (ECX) is not set by this helper; use cpuidex() when
 * a sub-leaf must be selected.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  /*
   * 32-bit path: EBX is copied into a scratch register before CPUID and
   * swapped back afterwards, so EBX holds its original value when the asm
   * statement ends (presumably because EBX can be reserved, e.g. as the PIC
   * base register, on these targets).
   */
  uint32_t a, b, c, d;
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#else
  uint32_t a, b, c, d;
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
43
/*
 * Runs the CPUID instruction for leaf `id` with sub-leaf `sid` (passed in
 * ECX) and stores the resulting EAX, EBX, ECX and EDX values in out[0],
 * out[1], out[2] and out[3].
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  /*
   * 32-bit path: EBX is copied into a scratch register before CPUID and
   * swapped back afterwards, so EBX holds its original value when the asm
   * statement ends (presumably because EBX can be reserved, e.g. as the PIC
   * base register, on these targets).
   */
  uint32_t a, b, c, d;
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#else
  uint32_t a, b, c, d;
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
59
60 #endif
61
/*
 * Bit flags for the x86 SIMD extensions this file can detect. Flags are
 * OR-ed together into a single feature mask. UNDEFINED is a sentinel meaning
 * "detection has not run yet"; it is the initial value of the cached mask.
 */
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000
};
73
74 #if !defined(BLAKE3_TESTING)
75 static /* Allow the variable to be controlled manually for testing */
76 #endif
77 enum cpu_feature g_cpu_features = UNDEFINED;
78
79 #if !defined(BLAKE3_TESTING)
80 static
81 #endif
82 enum cpu_feature
get_cpu_features()83 get_cpu_features() {
84
85 if (g_cpu_features != UNDEFINED) {
86 return g_cpu_features;
87 } else {
88 #if defined(IS_X86)
89 uint32_t regs[4] = {0};
90 uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
91 (void)edx;
92 enum cpu_feature features = 0;
93 cpuid(regs, 0);
94 const int max_id = *eax;
95 cpuid(regs, 1);
96 #if defined(__amd64__) || defined(_M_X64)
97 features |= SSE2;
98 #else
99 if (*edx & (1UL << 26))
100 features |= SSE2;
101 #endif
102 if (*ecx & (1UL << 0))
103 features |= SSSE3;
104 if (*ecx & (1UL << 19))
105 features |= SSE41;
106
107 if (*ecx & (1UL << 27)) { // OSXSAVE
108 const uint64_t mask = xgetbv();
109 if ((mask & 6) == 6) { // SSE and AVX states
110 if (*ecx & (1UL << 28))
111 features |= AVX;
112 if (max_id >= 7) {
113 cpuidex(regs, 7, 0);
114 if (*ebx & (1UL << 5))
115 features |= AVX2;
116 if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
117 if (*ebx & (1UL << 31))
118 features |= AVX512VL;
119 if (*ebx & (1UL << 16))
120 features |= AVX512F;
121 }
122 }
123 }
124 }
125 g_cpu_features = features;
126 return features;
127 #else
128 /* How to detect NEON? */
129 return 0;
130 #endif
131 }
132 }
133
/*
 * Runtime dispatcher for in-place compression.
 *
 * On x86 builds the detected CPU features select the first available
 * implementation in this order: AVX-512 (requires AVX512VL), SSE4.1, SSE2.
 * Each candidate can be compiled out with the matching BLAKE3_NO_* macro.
 * If none is selected, or the build is not x86, the portable implementation
 * is used. All arguments are forwarded unchanged.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif /* IS_X86 */

  /* Fallback when no SIMD implementation was chosen above. */
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
161
/*
 * Runtime dispatcher for the XOF compression function, which writes its
 * result to `out`.
 *
 * On x86 builds the detected CPU features select the first available
 * implementation in this order: AVX-512 (requires AVX512VL), SSE4.1, SSE2.
 * Each candidate can be compiled out with the matching BLAKE3_NO_* macro.
 * If none is selected, or the build is not x86, the portable implementation
 * is used. All arguments are forwarded unchanged.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif /* IS_X86 */

  /* Fallback when no SIMD implementation was chosen above. */
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
189
/*
 * Runtime dispatcher for the multi-input hashing routine.
 *
 * On x86 builds the detected CPU features select the first available
 * implementation in this order: AVX-512 (requires both AVX512F and
 * AVX512VL), AVX2, SSE4.1, SSE2. Each candidate can be compiled out with the
 * matching BLAKE3_NO_* macro. If no x86 implementation is selected, the NEON
 * implementation is used when BLAKE3_USE_NEON is defined, otherwise the
 * portable one. All arguments are forwarded unchanged.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512F) != 0 && (cpu & AVX512VL) != 0) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif /* IS_X86 */

#if defined(BLAKE3_USE_NEON)
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
240
// The dynamically detected SIMD degree of the current platform.
//
// On x86 builds the result follows the detected CPU features: 16 when both
// AVX512F and AVX512VL are present, 8 for AVX2, 4 for SSE4.1 or SSE2. Each
// tier can be compiled out with the matching BLAKE3_NO_* macro. If no x86
// tier applies, the result is 4 when BLAKE3_USE_NEON is defined and 1
// otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512F) != 0 && (cpu & AVX512VL) != 0) {
    return 16;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif
#endif /* IS_X86 */

#if defined(BLAKE3_USE_NEON)
  return 4;
#else
  return 1;
#endif
}
271