1 /*
2 BLAKE2 reference source code package - reference C implementations
3
4 Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
5 terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
6 your option. The terms of these licenses can be found at:
7
8 - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
9 - OpenSSL license : https://www.openssl.org/source/license.html
10 - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
11
12 More information about the BLAKE2 hash function can be found at
13 https://blake2.net.
14 */
15
16 #include <stdint.h>
17 #include <string.h>
18 #include <stdio.h>
19 #include <arm_neon.h>
20
21 #include "blake2.h"
22 #include "blake2-impl.h"
23
24 static const uint64_t blake2b_IV[8] =
25 {
26 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
27 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
28 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
29 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
30 };
31
32 /*
33 static const uint8_t blake2b_sigma[12][16] =
34 {
35 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
36 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
37 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
38 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
39 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
40 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
41 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
42 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
43 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
44 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
45 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
46 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
47 };
48 */
49
/* Flag S as the last node of its tree layer by setting f[1] to all ones. */
static void blake2b_set_lastnode( blake2b_state *S )
{
  S->f[1] = ~(uint64_t)0;
}
54
55 /* Some helper functions, not necessarily useful */
blake2b_is_lastblock(const blake2b_state * S)56 static int blake2b_is_lastblock( const blake2b_state *S )
57 {
58 return S->f[0] != 0;
59 }
60
/* Begin finalization: raise the last-block flag, plus the last-node flag
   when this state is the final node of a tree layer. */
static void blake2b_set_lastblock( blake2b_state *S )
{
  if( S->last_node )
    blake2b_set_lastnode( S );

  S->f[0] = ~(uint64_t)0;
}
67
/* Add `inc` bytes to the 128-bit message counter t, propagating the carry
   from the low word into the high word. */
static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
{
  const uint64_t t0 = S->t[0] + inc;

  S->t[1] += ( t0 < inc ); /* carry out of the low 64 bits */
  S->t[0]  = t0;
}
73
/* Zero the whole state, then load the BLAKE2b IV into the chaining value h. */
static void blake2b_init0( blake2b_state *S )
{
  memset( S, 0, sizeof( *S ) );
  memcpy( S->h, blake2b_IV, sizeof( blake2b_IV ) );
}
81
82 /* init xors IV with input parameter block */
/* Initialize the state from a fully-populated parameter block:
   h = IV XOR (P read as 8 little-endian 64-bit words).
   Records P->digest_length as S->outlen. Returns 0. */
int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
{
  const uint8_t *p = ( const uint8_t * )P;
  size_t i;

  blake2b_init0( S );

  /* IV XOR ParamBlock */
  for( i = 0; i < 8; ++i )
  {
    S->h[i] ^= load64( p + i * sizeof( S->h[i] ) );
  }

  S->outlen = P->digest_length;
  return 0;
}
97
98
99
/* Initialize S for an unkeyed, sequential (fanout=1, depth=1) hash producing
   `outlen` bytes. Returns 0 on success, -1 if outlen is 0 or too large. */
int blake2b_init( blake2b_state *S, size_t outlen )
{
  blake2b_param P[1];

  if( outlen == 0 || outlen > BLAKE2B_OUTBYTES ) return -1;

  /* Start from an all-zero parameter block, then set the non-zero fields.
     Zero bytes are endian-neutral, so this matches storing 0 field by field. */
  memset( P, 0, sizeof( P[0] ) );
  P->digest_length = (uint8_t)outlen;
  P->fanout        = 1;
  P->depth         = 1;

  return blake2b_init_param( S, P );
}
120
121
/* Initialize S for a keyed hash: like blake2b_init, but the key is absorbed
   as a full zero-padded first block. Returns 0 on success, -1 on bad
   outlen/key/keylen. */
int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
{
  blake2b_param P[1];

  if( outlen == 0 || outlen > BLAKE2B_OUTBYTES ) return -1;
  if( key == NULL || keylen == 0 || keylen > BLAKE2B_KEYBYTES ) return -1;

  /* All-zero parameter block with only the non-zero fields filled in. */
  memset( P, 0, sizeof( P[0] ) );
  P->digest_length = (uint8_t)outlen;
  P->key_length    = (uint8_t)keylen;
  P->fanout        = 1;
  P->depth         = 1;

  if( blake2b_init_param( S, P ) < 0 ) return -1;

  {
    /* The key occupies one whole block, padded with zeros. */
    uint8_t block[BLAKE2B_BLOCKBYTES] = {0};
    memcpy( block, key, keylen );
    blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
    secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
  }
  return 0;
}
154
/*
 * LOAD_MSG_r_i(b0, b1): message-loading macros for the NEON compression
 * function. For round r, the i-th of its four G-pair steps consumes four
 * 64-bit message words; these macros gather them from the preloaded vectors
 * m0..m7 (two message words each) into b0 and b1, following the BLAKE2b
 * message schedule (see the commented blake2b_sigma table above).
 *
 * Building blocks: vget_low_u64/vget_high_u64 select one 64-bit lane,
 * vcombine_u64(a, b) forms { a, b }, and vextq_u64(a, b, 1) forms
 * { a[1], b[0] }.
 */
#undef LOAD_MSG_0_1
#define LOAD_MSG_0_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

#undef LOAD_MSG_0_2
#define LOAD_MSG_0_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

#undef LOAD_MSG_0_3
#define LOAD_MSG_0_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

#undef LOAD_MSG_0_4
#define LOAD_MSG_0_4(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)

#undef LOAD_MSG_1_1
#define LOAD_MSG_1_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

#undef LOAD_MSG_1_2
#define LOAD_MSG_1_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

#undef LOAD_MSG_1_3
#define LOAD_MSG_1_3(b0, b1) \
  do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

#undef LOAD_MSG_1_4
#define LOAD_MSG_1_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)

#undef LOAD_MSG_2_1
#define LOAD_MSG_2_1(b0, b1) \
  do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)

#undef LOAD_MSG_2_2
#define LOAD_MSG_2_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)

#undef LOAD_MSG_2_3
#define LOAD_MSG_2_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)

#undef LOAD_MSG_2_4
#define LOAD_MSG_2_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)

#undef LOAD_MSG_3_1
#define LOAD_MSG_3_1(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)

#undef LOAD_MSG_3_2
#define LOAD_MSG_3_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

#undef LOAD_MSG_3_3
#define LOAD_MSG_3_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

#undef LOAD_MSG_3_4
#define LOAD_MSG_3_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

#undef LOAD_MSG_4_1
#define LOAD_MSG_4_1(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)

#undef LOAD_MSG_4_2
#define LOAD_MSG_4_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

#undef LOAD_MSG_4_3
#define LOAD_MSG_4_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)

#undef LOAD_MSG_4_4
#define LOAD_MSG_4_4(b0, b1) \
  do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)

#undef LOAD_MSG_5_1
#define LOAD_MSG_5_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

#undef LOAD_MSG_5_2
#define LOAD_MSG_5_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)

#undef LOAD_MSG_5_3
#define LOAD_MSG_5_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)

#undef LOAD_MSG_5_4
#define LOAD_MSG_5_4(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)

#undef LOAD_MSG_6_1
#define LOAD_MSG_6_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)

#undef LOAD_MSG_6_2
#define LOAD_MSG_6_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)

#undef LOAD_MSG_6_3
#define LOAD_MSG_6_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)

#undef LOAD_MSG_6_4
#define LOAD_MSG_6_4(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)

#undef LOAD_MSG_7_1
#define LOAD_MSG_7_1(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)

#undef LOAD_MSG_7_2
#define LOAD_MSG_7_2(b0, b1) \
  do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)

#undef LOAD_MSG_7_3
#define LOAD_MSG_7_3(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)

#undef LOAD_MSG_7_4
#define LOAD_MSG_7_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)

#undef LOAD_MSG_8_1
#define LOAD_MSG_8_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)

#undef LOAD_MSG_8_2
#define LOAD_MSG_8_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)

#undef LOAD_MSG_8_3
#define LOAD_MSG_8_3(b0, b1) \
  do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)

#undef LOAD_MSG_8_4
#define LOAD_MSG_8_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)

#undef LOAD_MSG_9_1
#define LOAD_MSG_9_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)

#undef LOAD_MSG_9_2
#define LOAD_MSG_9_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)

#undef LOAD_MSG_9_3
#define LOAD_MSG_9_3(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)

#undef LOAD_MSG_9_4
#define LOAD_MSG_9_4(b0, b1) \
  do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)

/* Rounds 10 and 11 repeat the schedules of rounds 0 and 1 (sigma has only
   ten distinct permutations; BLAKE2b runs twelve rounds). */
#undef LOAD_MSG_10_1
#define LOAD_MSG_10_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

#undef LOAD_MSG_10_2
#define LOAD_MSG_10_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

#undef LOAD_MSG_10_3
#define LOAD_MSG_10_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

#undef LOAD_MSG_10_4
#define LOAD_MSG_10_4(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)

#undef LOAD_MSG_11_1
#define LOAD_MSG_11_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

#undef LOAD_MSG_11_2
#define LOAD_MSG_11_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

#undef LOAD_MSG_11_3
#define LOAD_MSG_11_3(b0, b1) \
  do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

#undef LOAD_MSG_11_4
#define LOAD_MSG_11_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
346
/* Lanewise rotate right by 32 of each 64-bit lane: swap the 32-bit halves. */
#define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))

/* Rotate right by 24: rotate the bytes of each 64-bit half by 3 positions. */
#define vrorq_n_u64_24(x) vcombine_u64( \
  vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
  vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))

/* Rotate right by 16: rotate the bytes of each 64-bit half by 2 positions. */
#define vrorq_n_u64_16(x) vcombine_u64( \
  vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
  vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))

/* Rotate right by 63 == rotate left by 1: (x << 1) ^ (x >> 63); x + x is the shift left. */
#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
358
/*
 * G1/G2: the two halves of the BLAKE2b G function, vectorized so that all
 * four column (or diagonal) mixes run at once across the low (l) and high
 * (h) vector pairs. b0/b1 carry the four message words for this half-step.
 * G1 uses the 32- and 24-bit rotations, G2 the 16- and 63-bit rotations.
 */
#undef G1
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  do { \
    row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
    row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
    row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
    row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
    row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
    row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
    row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
  } while(0)

#undef G2
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  do { \
    row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
    row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
    row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
    row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
    row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
    row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
    row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
  } while(0)

/* DIAGONALIZE rotates rows 2, 3 and 4 of the 4x4 state left by 1, 2 and 3
   lanes respectively, so the following G pair mixes the diagonals;
   UNDIAGONALIZE applies the inverse rotation afterwards. */
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  do { \
    uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
    uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
    row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
    t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
    row4l = t0; row4h = t1; \
  } while(0)

#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  do { \
    uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
    uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
    row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
    t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
    row4l = t0; row4h = t1; \
  } while(0)

/* One full BLAKE2b round r: a column step (G1 + G2), diagonalize,
   a diagonal step (G1 + G2), then undiagonalize. */
#undef ROUND
#define ROUND(r) \
  do { \
    uint64x2_t b0, b1; \
    LOAD_MSG_ ##r ##_1(b0, b1); \
    G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    LOAD_MSG_ ##r ##_2(b0, b1); \
    G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    LOAD_MSG_ ##r ##_3(b0, b1); \
    G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    LOAD_MSG_ ##r ##_4(b0, b1); \
    G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
  } while(0)
416
/* BLAKE2b compression function: mix one 128-byte message block into S->h,
   using the message counter S->t and finalization flags S->f. NEON version:
   the 4x4 matrix of 64-bit state words lives in eight 2-lane vectors
   (row1l/row1h .. row4l/row4h). */
static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
  /* The 16 little-endian message words, two per vector. */
  const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(&block[ 0]));
  const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(&block[ 16]));
  const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(&block[ 32]));
  const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(&block[ 48]));
  const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(&block[ 64]));
  const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(&block[ 80]));
  const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(&block[ 96]));
  const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(&block[112]));

  uint64x2_t row1l, row1h, row2l, row2h;
  uint64x2_t row3l, row3h, row4l, row4h;

  /* Rows 1-2 of the working state: the chaining value h0..h7
     (also kept aside for the feed-forward at the end). */
  const uint64x2_t h0 = row1l = vld1q_u64(&S->h[0]);
  const uint64x2_t h1 = row1h = vld1q_u64(&S->h[2]);
  const uint64x2_t h2 = row2l = vld1q_u64(&S->h[4]);
  const uint64x2_t h3 = row2h = vld1q_u64(&S->h[6]);

  /* Rows 3-4: the IV, with the counter t XORed into IV[4..5] and the
     finalization flags f XORed into IV[6..7]. */
  row3l = vld1q_u64(&blake2b_IV[0]);
  row3h = vld1q_u64(&blake2b_IV[2]);
  row4l = veorq_u64(vld1q_u64(&blake2b_IV[4]), vld1q_u64(&S->t[0]));
  row4h = veorq_u64(vld1q_u64(&blake2b_IV[6]), vld1q_u64(&S->f[0]));

  /* Twelve rounds of mixing. */
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );

  /* Feed-forward: h ^= (top half XOR bottom half) of the working state. */
  vst1q_u64(&S->h[0], veorq_u64(h0, veorq_u64(row1l, row3l)));
  vst1q_u64(&S->h[2], veorq_u64(h1, veorq_u64(row1h, row3h)));
  vst1q_u64(&S->h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
  vst1q_u64(&S->h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
}
459
460 #undef G
461 #undef ROUND
462
/* Absorb `inlen` bytes of input into the hash state. Data is buffered in
   S->buf; full 128-byte blocks are compressed as they become available.
   Note the strict `>` comparisons: at least one byte is always left in the
   buffer so finalization can mark the last block. Returns 0 (this
   implementation reports no errors). */
int blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
{
  const unsigned char * in = (const unsigned char *)pin;
  if( inlen > 0 )
  {
    size_t left = S->buflen;
    size_t fill = BLAKE2B_BLOCKBYTES - left;
    if( inlen > fill ) /* input overflows the buffer: compress what we can */
    {
      S->buflen = 0;
      memcpy( S->buf + left, in, fill ); /* Fill buffer */
      blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
      blake2b_compress( S, S->buf ); /* Compress */
      in += fill; inlen -= fill;
      /* Compress whole blocks directly from the input, keeping the final
         (possibly exactly full) block for the buffer. */
      while(inlen > BLAKE2B_BLOCKBYTES) {
        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
        blake2b_compress( S, in );
        in += BLAKE2B_BLOCKBYTES;
        inlen -= BLAKE2B_BLOCKBYTES;
      }
    }
    /* Stash the remainder (at most one block). */
    memcpy( S->buf + S->buflen, in, inlen );
    S->buflen += inlen;
  }
  return 0;
}
489
/* Finalize the hash: zero-pad and compress the buffered tail with the
   last-block flag set, then write S->outlen digest bytes to `out`.
   Returns 0 on success; -1 if out is NULL, outlen is smaller than the
   digest length chosen at init, or the state was already finalized. */
int blake2b_final( blake2b_state *S, void *out, size_t outlen )
{
  uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
  size_t i;

  if( out == NULL || outlen < S->outlen )
    return -1;

  if( blake2b_is_lastblock( S ) ) /* reject double finalization */
    return -1;

  blake2b_increment_counter( S, S->buflen );
  blake2b_set_lastblock( S );
  memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
  blake2b_compress( S, S->buf );

  for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
    store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );

  /* Copy only the requested digest length, then wipe the full-size copy. */
  memcpy( out, buffer, S->outlen );
  secure_zero_memory(buffer, sizeof(buffer));
  return 0;
}
513
514 /* inlen, at least, should be uint64_t. Others can be size_t. */
blake2b(void * out,size_t outlen,const void * in,size_t inlen,const void * key,size_t keylen)515 int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
516 {
517 blake2b_state S[1];
518
519 /* Verify parameters */
520 if ( NULL == in && inlen > 0 ) return -1;
521
522 if ( NULL == out ) return -1;
523
524 if( NULL == key && keylen > 0 ) return -1;
525
526 if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
527
528 if( keylen > BLAKE2B_KEYBYTES ) return -1;
529
530 if( keylen > 0 )
531 {
532 if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
533 }
534 else
535 {
536 if( blake2b_init( S, outlen ) < 0 ) return -1;
537 }
538
539 blake2b_update( S, ( const uint8_t * )in, inlen );
540 blake2b_final( S, out, outlen );
541 return 0;
542 }
543
/* Generic BLAKE2 entry point: BLAKE2b is the default variant here. */
int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
{
  return blake2b( out, outlen, in, inlen, key, keylen );
}
547
548 #if defined(SUPERCOP)
crypto_hash(unsigned char * out,unsigned char * in,unsigned long long inlen)549 int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
550 {
551 return blake2b( out, BLAKE2B_OUTBYTES, in, inlen, NULL, 0 );
552 }
553 #endif
554
555 #if defined(BLAKE2B_SELFTEST)
556 #include <string.h>
557 #include "blake2-kat.h"
/* Self-test: verify keyed BLAKE2b against the known-answer vectors in
   blake2-kat.h, through both the one-shot API and the streaming API (the
   latter fed in every chunk size from 1 to BLAKE2B_BLOCKBYTES - 1 bytes).
   Prints "ok" and returns 0 on success; "error" and -1 on any mismatch. */
int main( void )
{
  uint8_t key[BLAKE2B_KEYBYTES];
  uint8_t buf[BLAKE2_KAT_LENGTH];
  size_t i, step;

  /* Deterministic test inputs: key = 00 01 02 ..., buf = 00 01 02 ... */
  for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
    key[i] = ( uint8_t )i;

  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
    buf[i] = ( uint8_t )i;

  /* Test simple API: one message length per KAT entry. */
  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
  {
    uint8_t hash[BLAKE2B_OUTBYTES];
    blake2b( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );

    if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
    {
      goto fail;
    }
  }

  /* Test streaming API: same vectors, message fed in `step`-byte chunks. */
  for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) {
    for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
      uint8_t hash[BLAKE2B_OUTBYTES];
      blake2b_state S;
      uint8_t * p = buf;
      size_t mlen = i;
      int err = 0;

      if( (err = blake2b_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) {
        goto fail;
      }

      /* Feed whole chunks, then the sub-chunk remainder. */
      while (mlen >= step) {
        if ( (err = blake2b_update(&S, p, step)) < 0 ) {
          goto fail;
        }
        mlen -= step;
        p += step;
      }
      if ( (err = blake2b_update(&S, p, mlen)) < 0) {
        goto fail;
      }
      if ( (err = blake2b_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) {
        goto fail;
      }

      if (0 != memcmp(hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES)) {
        goto fail;
      }
    }
  }

  puts( "ok" );
  return 0;
fail:
  puts("error");
  return -1;
}
621 #endif
622