1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4 #include <stdio.h>
5
6 #include "blake3_impl.h"
7
8 #if defined(IS_X86)
9 #if defined(_MSC_VER)
10 #include <intrin.h>
11 #elif defined(__GNUC__)
12 #include <immintrin.h>
13 #else
14 #error "Unimplemented!"
15 #endif
16 #endif
17
18 #if defined(IS_X86)
/*
 * Execute XGETBV with ECX = 0 and return the result as one 64-bit value:
 * EDX supplies the upper 32 bits and EAX the lower 32 bits.
 *
 * get_cpu_features() only calls this after it has seen the CPUID OSXSAVE
 * bit set; keep that ordering, since XGETBV is not usable otherwise.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  /* The "c"(0) input places 0 in ECX, which selects the register to read. */
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
28
/*
 * Execute CPUID with EAX = id and store the resulting EAX, EBX, ECX and EDX
 * values in out[0], out[1], out[2] and out[3] respectively.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t reg_a, reg_b, reg_c, reg_d;
#if defined(__i386__) || defined(_M_IX86)
  /*
   * On 32-bit x86 the EBX result is routed through a scratch register:
   * EBX is copied aside before CPUID and swapped back afterwards, so EBX
   * holds its original value once the asm block finishes.
   * NOTE(review): presumably because EBX can be reserved (e.g. for PIC).
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
#endif
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
44
/*
 * Execute CPUID with EAX = id and ECX = sid (the sub-leaf) and store the
 * resulting EAX, EBX, ECX and EDX values in out[0], out[1], out[2] and
 * out[3] respectively.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t reg_a, reg_b, reg_c, reg_d;
#if defined(__i386__) || defined(_M_IX86)
  /*
   * On 32-bit x86 the EBX result is routed through a scratch register:
   * EBX is copied aside before CPUID and swapped back afterwards, so EBX
   * holds its original value once the asm block finishes.
   * NOTE(review): presumably because EBX can be reserved (e.g. for PIC).
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
60
61 #endif
62
/*
 * Bit flags describing detected CPU capabilities. Every constant occupies
 * its own bit so that several of them can be OR-ed into one value.
 */
enum cpu_feature {
  SSE2 = 0x00000001,     /* bit 0 */
  SSSE3 = 0x00000002,    /* bit 1 */
  SSE41 = 0x00000004,    /* bit 2 */
  AVX = 0x00000008,      /* bit 3 */
  AVX2 = 0x00000010,     /* bit 4 */
  AVX512F = 0x00000020,  /* bit 5 */
  AVX512VL = 0x00000040, /* bit 6 */
  /* ... */
  UNDEFINED = 0x40000000 /* bit 30: marker for "detection has not run yet" */
};
74
75 #if !defined(BLAKE3_TESTING)
76 static /* Allow the variable to be controlled manually for testing */
77 #endif
78 enum cpu_feature g_cpu_features = UNDEFINED;
79
80 #if !defined(BLAKE3_TESTING)
81 static
82 #endif
83 enum cpu_feature
get_cpu_features()84 get_cpu_features() {
85
86 if (g_cpu_features != UNDEFINED) {
87 return g_cpu_features;
88 } else {
89 #if defined(IS_X86)
90 uint32_t regs[4] = {0};
91 uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
92 (void)edx;
93 enum cpu_feature features = 0;
94 cpuid(regs, 0);
95 const int max_id = *eax;
96 cpuid(regs, 1);
97 #if defined(__amd64__) || defined(_M_X64)
98 features |= SSE2;
99 #else
100 if (*edx & (1UL << 26))
101 features |= SSE2;
102 #endif
103 if (*ecx & (1UL << 0))
104 features |= SSSE3;
105 if (*ecx & (1UL << 19))
106 features |= SSE41;
107
108 if (*ecx & (1UL << 27)) { // OSXSAVE
109 const uint64_t mask = xgetbv();
110 if ((mask & 6) == 6) { // SSE and AVX states
111 if (*ecx & (1UL << 28))
112 features |= AVX;
113 if (max_id >= 7) {
114 cpuidex(regs, 7, 0);
115 if (*ebx & (1UL << 5))
116 features |= AVX2;
117 if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
118 if (*ebx & (1UL << 31))
119 features |= AVX512VL;
120 if (*ebx & (1UL << 16))
121 features |= AVX512F;
122 }
123 }
124 }
125 }
126 g_cpu_features = features;
127 return features;
128 #else
129 /* How to detect NEON? */
130 return 0;
131 #endif
132 }
133 }
134
/*
 * Forward an in-place compression call to a backend chosen at run time.
 *
 * On x86 the order of preference is: the AVX-512 routine when AVX512VL was
 * detected, then the SSE4.1 routine when SSE41 was detected. Either can be
 * compiled out with BLAKE3_NO_AVX512 / BLAKE3_NO_SSE41. If nothing matches,
 * the portable routine is used. All arguments are passed through unchanged.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif
  /* No SIMD backend selected. */
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
156
/*
 * Forward an XOF compression call (result written to `out`) to a backend
 * chosen at run time.
 *
 * On x86 the order of preference is: the AVX-512 routine when AVX512VL was
 * detected, then the SSE4.1 routine when SSE41 was detected. Either can be
 * compiled out with BLAKE3_NO_AVX512 / BLAKE3_NO_SSE41. If nothing matches,
 * the portable routine is used. All arguments are passed through unchanged.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif
  /* No SIMD backend selected. */
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
178
/*
 * Forward a multi-input hashing call to a backend chosen at run time.
 *
 * On x86 the order of preference is: AVX-512 (needs both AVX512F and
 * AVX512VL), then AVX2, then SSE4.1; each can be compiled out with the
 * matching BLAKE3_NO_* macro. When no x86 backend is taken, builds with
 * BLAKE3_USE_NEON call the NEON routine and all other builds call the
 * portable routine. All arguments are passed through unchanged.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
  const int avx512_mask = AVX512F | AVX512VL;
  if ((cpu & avx512_mask) == avx512_mask) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#endif

#if defined(BLAKE3_USE_NEON)
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
221
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 when both AVX512F and AVX512VL are detected, 8 for AVX2 and 4
// for SSE4.1 (each subject to the matching BLAKE3_NO_* macro). When no x86
// backend applies, builds with BLAKE3_USE_NEON return 4 and all other
// builds return 1.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
  const int avx512_mask = AVX512F | AVX512VL;
  if ((cpu & avx512_mask) == avx512_mask) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif
#endif

#if defined(BLAKE3_USE_NEON)
  return 4;
#else
  return 1;
#endif
}
247