1 /*
2    BLAKE2 reference source code package - reference C implementations
3 
4    Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
5    terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
6    your option.  The terms of these licenses can be found at:
7 
8    - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
9    - OpenSSL license   : https://www.openssl.org/source/license.html
10    - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
11 
12    More information about the BLAKE2 hash function can be found at
13    https://blake2.net.
14 */
15 
16 #include <stdint.h>
17 #include <string.h>
18 #include <stdio.h>
19 #include <arm_neon.h>
20 
21 #include "blake2.h"
22 #include "blake2-impl.h"
23 
/* BLAKE2b initialisation vector.  blake2b_init0() copies it into h[], and
   blake2b_compress() loads it into rows 3 and 4 of the working state. */
static const uint64_t blake2b_IV[8] =
{
  0x6a09e667f3bcc908ULL,
  0xbb67ae8584caa73bULL,
  0x3c6ef372fe94f82bULL,
  0xa54ff53a5f1d36f1ULL,
  0x510e527fade682d1ULL,
  0x9b05688c2b3e6c1fULL,
  0x1f83d9abfb41bd6bULL,
  0x5be0cd19137e2179ULL
};
31 
/*
   Reference only: the BLAKE2b message word permutation table.  It is not
   compiled; the NEON code hard-wires the same schedule in the
   LOAD_MSG_<round>_<n> macros further down.

static const uint8_t blake2b_sigma[12][16] =
{
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
  { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
  {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
  {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
  {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
  { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
  { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
  {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
  { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 } ,
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};
*/
49 
/* Set the second finalization flag word, f[1], to all ones (the "last node"
   marker used by blake2b_set_lastblock() when S->last_node is set). */
static void blake2b_set_lastnode( blake2b_state *S )
{
  S->f[1] = ~( uint64_t )0;
}
54 
55 /* Some helper functions, not necessarily useful */
blake2b_is_lastblock(const blake2b_state * S)56 static int blake2b_is_lastblock( const blake2b_state *S )
57 {
58   return S->f[0] != 0;
59 }
60 
/* Mark the state as processing its final block: f[0] becomes all ones, and
   f[1] as well when the state was flagged as the last node. */
static void blake2b_set_lastblock( blake2b_state *S )
{
  S->f[0] = ~( uint64_t )0;

  if( S->last_node )
  {
    blake2b_set_lastnode( S );
  }
}
67 
/* Add inc to the 128-bit byte counter held as t[0] (low word) and t[1]
   (high word). */
static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
{
  const uint64_t low = S->t[0] + inc;

  /* The unsigned sum is smaller than an addend exactly when it wrapped. */
  if( low < inc )
  {
    S->t[1] += 1;
  }

  S->t[0] = low;
}
73 
/* Reset the whole state to zero and load the initialisation vector into the
   eight chaining words h[0..7]. */
static void blake2b_init0( blake2b_state *S )
{
  memset( S, 0, sizeof( *S ) );
  memcpy( S->h, blake2b_IV, sizeof( blake2b_IV ) );
}
81 
82 /* init xors IV with input parameter block */
blake2b_init_param(blake2b_state * S,const blake2b_param * P)83 int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
84 {
85   const uint8_t *p = ( const uint8_t * )( P );
86   size_t i;
87 
88   blake2b_init0( S );
89 
90   /* IV XOR ParamBlock */
91   for( i = 0; i < 8; ++i )
92     S->h[i] ^= load64( p + sizeof( S->h[i] ) * i );
93 
94   S->outlen = P->digest_length;
95   return 0;
96 }
97 
98 
99 
/* Initialise S for unkeyed, sequential hashing with an outlen-byte digest.
   Returns -1 when outlen is 0 or larger than BLAKE2B_OUTBYTES, otherwise the
   result of blake2b_init_param(). */
int blake2b_init( blake2b_state *S, size_t outlen )
{
  blake2b_param param;

  if( outlen == 0 || outlen > BLAKE2B_OUTBYTES )
  {
    return -1;
  }

  /* Sequential mode: fanout and depth are 1, every tree field is zero. */
  param.digest_length = ( uint8_t )outlen;
  param.key_length    = 0;
  param.fanout        = 1;
  param.depth         = 1;
  param.node_depth    = 0;
  param.inner_length  = 0;
  store32( &param.leaf_length, 0 );
  store32( &param.node_offset, 0 );
  store32( &param.xof_length, 0 );
  memset( param.reserved, 0, sizeof( param.reserved ) );
  memset( param.salt,     0, sizeof( param.salt ) );
  memset( param.personal, 0, sizeof( param.personal ) );

  return blake2b_init_param( S, &param );
}
120 
121 
/* Initialise S for keyed hashing (MAC mode) with an outlen-byte digest.
   The key is zero-padded to one full block and absorbed as the first block.
   Returns -1 when outlen is 0 or exceeds BLAKE2B_OUTBYTES, when key is NULL,
   or when keylen is 0 or exceeds BLAKE2B_KEYBYTES; returns 0 otherwise. */
int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
{
  blake2b_param param;

  if( outlen == 0 || outlen > BLAKE2B_OUTBYTES )
  {
    return -1;
  }

  if( key == NULL || keylen == 0 || keylen > BLAKE2B_KEYBYTES )
  {
    return -1;
  }

  /* Sequential mode: fanout and depth are 1, every tree field is zero. */
  param.digest_length = ( uint8_t )outlen;
  param.key_length    = ( uint8_t )keylen;
  param.fanout        = 1;
  param.depth         = 1;
  param.node_depth    = 0;
  param.inner_length  = 0;
  store32( &param.leaf_length, 0 );
  store32( &param.node_offset, 0 );
  store32( &param.xof_length, 0 );
  memset( param.reserved, 0, sizeof( param.reserved ) );
  memset( param.salt,     0, sizeof( param.salt ) );
  memset( param.personal, 0, sizeof( param.personal ) );

  if( blake2b_init_param( S, &param ) < 0 )
  {
    return -1;
  }

  {
    uint8_t key_block[BLAKE2B_BLOCKBYTES] = {0};

    memcpy( key_block, key, keylen );
    blake2b_update( S, key_block, sizeof( key_block ) );
    secure_zero_memory( key_block, sizeof( key_block ) ); /* Burn the key from stack */
  }

  return 0;
}
154 
/*
   Message loaders for round 0 (identity permutation).

   blake2b_compress() loads the block into m0..m7, so vector mK carries
   message word 2K in its low lane and word 2K+1 in its high lane.  Each loader
   fills b0 and b1 with four message words.  Within ROUND(), _1 and _2 feed
   G1/G2 of the column step and _3 and _4 feed G1/G2 of the diagonal step.
*/

/* Round 0, column step, G1: words 0, 2, 4, 6. */
#undef LOAD_MSG_0_1
#define LOAD_MSG_0_1(b0, b1) \
do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

/* Round 0, column step, G2: words 1, 3, 5, 7. */
#undef LOAD_MSG_0_2
#define LOAD_MSG_0_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

/* Round 0, diagonal step, G1: words 8, 10, 12, 14. */
#undef LOAD_MSG_0_3
#define LOAD_MSG_0_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

/* Round 0, diagonal step, G2: words 9, 11, 13, 15. */
#undef LOAD_MSG_0_4
#define LOAD_MSG_0_4(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
170 
/* Message loaders for round 1.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane); vextq_u64(a, b, 1) yields { high(a), low(b) }. */

/* Round 1, column step, G1: words 14, 4, 9, 13. */
#undef LOAD_MSG_1_1
#define LOAD_MSG_1_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

/* Round 1, column step, G2: words 10, 8, 15, 6. */
#undef LOAD_MSG_1_2
#define LOAD_MSG_1_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

/* Round 1, diagonal step, G1: words 1, 0, 11, 5. */
#undef LOAD_MSG_1_3
#define LOAD_MSG_1_3(b0, b1) \
  do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

/* Round 1, diagonal step, G2: words 12, 2, 7, 3. */
#undef LOAD_MSG_1_4
#define LOAD_MSG_1_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
186 
/* Message loaders for round 2.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane); vextq_u64(a, b, 1) yields { high(a), low(b) }. */

/* Round 2, column step, G1: words 11, 12, 5, 15. */
#undef LOAD_MSG_2_1
#define LOAD_MSG_2_1(b0, b1) \
  do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)

/* Round 2, column step, G2: words 8, 0, 2, 13. */
#undef LOAD_MSG_2_2
#define LOAD_MSG_2_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)

/* Round 2, diagonal step, G1: words 10, 3, 7, 9. */
#undef LOAD_MSG_2_3
#define LOAD_MSG_2_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)

/* Round 2, diagonal step, G2: words 14, 6, 1, 4. */
#undef LOAD_MSG_2_4
#define LOAD_MSG_2_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)
202 
/* Message loaders for round 3.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane). */

/* Round 3, column step, G1: words 7, 3, 13, 11. */
#undef LOAD_MSG_3_1
#define LOAD_MSG_3_1(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)

/* Round 3, column step, G2: words 9, 1, 12, 14. */
#undef LOAD_MSG_3_2
#define LOAD_MSG_3_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

/* Round 3, diagonal step, G1: words 2, 5, 4, 15. */
#undef LOAD_MSG_3_3
#define LOAD_MSG_3_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

/* Round 3, diagonal step, G2: words 6, 10, 0, 8. */
#undef LOAD_MSG_3_4
#define LOAD_MSG_3_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
218 
/* Message loaders for round 4.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane); vextq_u64(a, b, 1) yields { high(a), low(b) }. */

/* Round 4, column step, G1: words 9, 5, 2, 10. */
#undef LOAD_MSG_4_1
#define LOAD_MSG_4_1(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)

/* Round 4, column step, G2: words 0, 7, 4, 15. */
#undef LOAD_MSG_4_2
#define LOAD_MSG_4_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

/* Round 4, diagonal step, G1: words 14, 11, 6, 3. */
#undef LOAD_MSG_4_3
#define LOAD_MSG_4_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)

/* Round 4, diagonal step, G2: words 1, 12, 8, 13. */
#undef LOAD_MSG_4_4
#define LOAD_MSG_4_4(b0, b1) \
  do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)
234 
/* Message loaders for round 5.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane). */

/* Round 5, column step, G1: words 2, 6, 0, 8. */
#undef LOAD_MSG_5_1
#define LOAD_MSG_5_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

/* Round 5, column step, G2: words 12, 10, 11, 3. */
#undef LOAD_MSG_5_2
#define LOAD_MSG_5_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)

/* Round 5, diagonal step, G1: words 4, 7, 15, 1. */
#undef LOAD_MSG_5_3
#define LOAD_MSG_5_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)

/* Round 5, diagonal step, G2: words 13, 5, 14, 9. */
#undef LOAD_MSG_5_4
#define LOAD_MSG_5_4(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)
250 
/* Message loaders for round 6.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane); vextq_u64(a, b, 1) yields { high(a), low(b) }. */

/* Round 6, column step, G1: words 12, 1, 14, 4. */
#undef LOAD_MSG_6_1
#define LOAD_MSG_6_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)

/* Round 6, column step, G2: words 5, 15, 13, 10. */
#undef LOAD_MSG_6_2
#define LOAD_MSG_6_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)

/* Round 6, diagonal step, G1: words 0, 6, 9, 8. */
#undef LOAD_MSG_6_3
#define LOAD_MSG_6_3(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)

/* Round 6, diagonal step, G2: words 7, 3, 2, 11. */
#undef LOAD_MSG_6_4
#define LOAD_MSG_6_4(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)
266 
/* Message loaders for round 7.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane); vextq_u64(a, b, 1) yields { high(a), low(b) }. */

/* Round 7, column step, G1: words 13, 7, 12, 3. */
#undef LOAD_MSG_7_1
#define LOAD_MSG_7_1(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)

/* Round 7, column step, G2: words 11, 14, 1, 9. */
#undef LOAD_MSG_7_2
#define LOAD_MSG_7_2(b0, b1) \
  do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)

/* Round 7, diagonal step, G1: words 5, 15, 8, 2. */
#undef LOAD_MSG_7_3
#define LOAD_MSG_7_3(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)

/* Round 7, diagonal step, G2: words 0, 4, 6, 10. */
#undef LOAD_MSG_7_4
#define LOAD_MSG_7_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)
282 
/* Message loaders for round 8.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane); vextq_u64(a, b, 1) yields { high(a), low(b) }. */

/* Round 8, column step, G1: words 6, 14, 11, 0. */
#undef LOAD_MSG_8_1
#define LOAD_MSG_8_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)

/* Round 8, column step, G2: words 15, 9, 3, 8. */
#undef LOAD_MSG_8_2
#define LOAD_MSG_8_2(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)

/* Round 8, diagonal step, G1: words 12, 13, 1, 10 (m6 is used as-is). */
#undef LOAD_MSG_8_3
#define LOAD_MSG_8_3(b0, b1) \
  do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)

/* Round 8, diagonal step, G2: words 2, 7, 4, 5 (m2 is used as-is). */
#undef LOAD_MSG_8_4
#define LOAD_MSG_8_4(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)
298 
/* Message loaders for round 9.  Vector mK holds message word 2K (low lane)
   and 2K+1 (high lane); vextq_u64(a, b, 1) yields { high(a), low(b) }. */

/* Round 9, column step, G1: words 10, 8, 7, 1. */
#undef LOAD_MSG_9_1
#define LOAD_MSG_9_1(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)

/* Round 9, column step, G2: words 2, 4, 6, 5. */
#undef LOAD_MSG_9_2
#define LOAD_MSG_9_2(b0, b1) \
  do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)

/* Round 9, diagonal step, G1: words 15, 9, 3, 13. */
#undef LOAD_MSG_9_3
#define LOAD_MSG_9_3(b0, b1) \
  do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)

/* Round 9, diagonal step, G2: words 11, 14, 12, 0. */
#undef LOAD_MSG_9_4
#define LOAD_MSG_9_4(b0, b1) \
  do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)
314 
/* Rounds 10 and 11 use the same message schedule as rounds 0 and 1, so their
   loaders forward to the definitions already made for those rounds. */

#undef LOAD_MSG_10_1
#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)

#undef LOAD_MSG_10_2
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)

#undef LOAD_MSG_10_3
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)

#undef LOAD_MSG_10_4
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)

#undef LOAD_MSG_11_1
#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)

#undef LOAD_MSG_11_2
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)

#undef LOAD_MSG_11_3
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)

#undef LOAD_MSG_11_4
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)
346 
/*
   Per-lane 64-bit rotations used by G1/G2.  The argument is expanded several
   times, so pass a plain variable.  The "rotate right" reading of the 24- and
   16-bit variants assumes byte 0 of each lane is its least significant byte
   (little-endian lane layout).
*/

/* Rotate by 32: swap the two 32-bit halves of every 64-bit lane. */
#define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))

/* Rotate right by 24: rotate the eight bytes of each lane by three positions. */
#define vrorq_n_u64_24(x) vcombine_u64( \
      vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
      vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))

/* Rotate right by 16: rotate the eight bytes of each lane by two positions. */
#define vrorq_n_u64_16(x) vcombine_u64( \
      vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
      vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))

/* Rotate right by 63 (= left by 1): x + x is x << 1, and x >> 63 is the bit
   shifted out.  The two values share no set bits, so XOR combines them like OR. */
#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
358 
/*
   First half of the mixing step, applied to four 64-bit columns at once
   (the "l" vectors hold two of them, the "h" vectors the other two):
     row1 += row2 + message;  row4 = ror(row4 ^ row1, 32);
     row3 += row4;            row2 = ror(row2 ^ row3, 24);
   b0 supplies the message words for the "l" half, b1 for the "h" half.
   All row arguments are updated in place.
*/
#undef G1
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  do { \
    row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
    row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
    row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
    row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
    row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
    row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
    row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
  } while(0)

/*
   Second half of the mixing step.  Same structure as G1, but with rotation
   amounts 16 (for row4) and 63 (for row2).
*/
#undef G2
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  do { \
    row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
    row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
    row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
    row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
    row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
    row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
    row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
  } while(0)
382 
/*
   Rotate the rows of the 4x4 working state so that its diagonals line up as
   columns: row2 is rotated by one word, row3 by two (its halves are swapped)
   and row4 by three.  row1 is left untouched; row1l/row1h are accepted only
   to keep the argument list uniform with G1/G2.
*/
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  do { \
    uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
    uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
    row2l = t0; row2h = t1; t0 = row3l;  row3l = row3h; row3h = t0; \
    t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
    row4l = t0; row4h = t1; \
  } while(0)

/*
   Inverse of DIAGONALIZE: rotates row2, row3 and row4 back so the state is in
   column order again.
*/
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  do { \
    uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
    uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
    row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
    t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
    row4l = t0; row4h = t1; \
  } while(0)
400 
/*
   One full round r: a column step (G1 + G2 on the rows as they stand),
   followed by a diagonal step (G1 + G2 between DIAGONALIZE and
   UNDIAGONALIZE).  r must be a literal round number because it is pasted
   into the LOAD_MSG_<r>_<n> macro names.

   The macro is not self-contained: it uses the variables row1l..row4h of the
   enclosing function, and the loaders it expands use m0..m7.
*/
#undef ROUND
#define ROUND(r) \
  do { \
    uint64x2_t b0, b1; \
    LOAD_MSG_ ##r ##_1(b0, b1); \
    G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    LOAD_MSG_ ##r ##_2(b0, b1); \
    G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    LOAD_MSG_ ##r ##_3(b0, b1); \
    G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    LOAD_MSG_ ##r ##_4(b0, b1); \
    G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
  } while(0)
416 
/*
   Compress one BLAKE2B_BLOCKBYTES-byte block into the chaining value S->h.

   S->t (counter) and S->f (finalization flags) are read but not modified
   here; callers in this file update them before calling.  The local names
   m0..m7 and row1l..row4h must not be renamed: the LOAD_MSG_* and ROUND
   macros refer to them directly.
*/
static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
  /* Message block as eight 2x64-bit vectors; byte loads, so no alignment is assumed. */
  const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(&block[  0]));
  const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(&block[ 16]));
  const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(&block[ 32]));
  const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(&block[ 48]));
  const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(&block[ 64]));
  const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(&block[ 80]));
  const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(&block[ 96]));
  const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(&block[112]));

  /* Working state: four rows of four 64-bit words, each row split into a
     low ("l") and a high ("h") vector. */
  uint64x2_t row1l, row1h, row2l, row2h;
  uint64x2_t row3l, row3h, row4l, row4h;

  /* Rows 1 and 2 start as the chaining value; h0..h3 keep a copy for the
     feed-forward at the end. */
  const uint64x2_t h0 = row1l = vld1q_u64(&S->h[0]);
  const uint64x2_t h1 = row1h = vld1q_u64(&S->h[2]);
  const uint64x2_t h2 = row2l = vld1q_u64(&S->h[4]);
  const uint64x2_t h3 = row2h = vld1q_u64(&S->h[6]);

  /* Row 3 is IV[0..3]; row 4 is IV[4..7] XORed with the counter t[0..1] and
     the finalization flags f[0..1]. */
  row3l = vld1q_u64(&blake2b_IV[0]);
  row3h = vld1q_u64(&blake2b_IV[2]);
  row4l = veorq_u64(vld1q_u64(&blake2b_IV[4]), vld1q_u64(&S->t[0]));
  row4h = veorq_u64(vld1q_u64(&blake2b_IV[6]), vld1q_u64(&S->f[0]));

  /* Twelve rounds, each with its own message word schedule. */
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );

  /* Feed-forward: h[0..3] ^= row1 ^ row3, h[4..7] ^= row2 ^ row4. */
  vst1q_u64(&S->h[0], veorq_u64(h0, veorq_u64(row1l, row3l)));
  vst1q_u64(&S->h[2], veorq_u64(h1, veorq_u64(row1h, row3h)));
  vst1q_u64(&S->h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
  vst1q_u64(&S->h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
}
459 
/* The round helpers are only meant for blake2b_compress(); drop them here.
   The mixing macros in this file are named G1 and G2 (there is no macro G). */
#undef G1
#undef G2
#undef ROUND
462 
/*
   Absorb inlen bytes from pin into the state.

   Input is staged in S->buf.  Once more data arrives than fits, the buffer is
   topped up and compressed, and further full blocks are compressed straight
   from the input.  The comparisons are strict, so at least one byte always
   remains buffered afterwards.  Always returns 0.
*/
int blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
{
  const unsigned char *src = ( const unsigned char * )pin;
  size_t buffered;
  size_t room;

  if( inlen == 0 )
  {
    return 0;
  }

  buffered = S->buflen;
  room = BLAKE2B_BLOCKBYTES - buffered;

  if( inlen > room )
  {
    /* Complete the pending block and compress it. */
    memcpy( S->buf + buffered, src, room );
    S->buflen = 0;
    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
    blake2b_compress( S, S->buf );
    src += room;
    inlen -= room;

    /* Whole blocks go directly from the caller's buffer. */
    for( ; inlen > BLAKE2B_BLOCKBYTES; src += BLAKE2B_BLOCKBYTES, inlen -= BLAKE2B_BLOCKBYTES )
    {
      blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
      blake2b_compress( S, src );
    }
  }

  /* Stash whatever is left for the next call or for blake2b_final(). */
  memcpy( S->buf + S->buflen, src, inlen );
  S->buflen += inlen;
  return 0;
}
489 
/*
   Finish hashing and write S->outlen digest bytes to out.

   Returns -1 when out is NULL, when outlen is smaller than S->outlen, or when
   the state has already been finalised; returns 0 on success.  The buffered
   tail is zero-padded to a full block before the last compression.
*/
int blake2b_final( blake2b_state *S, void *out, size_t outlen )
{
  uint8_t digest[BLAKE2B_OUTBYTES] = {0};
  size_t word;

  if( out == NULL )
  {
    return -1;
  }

  if( outlen < S->outlen )
  {
    return -1;
  }

  if( blake2b_is_lastblock( S ) )
  {
    return -1;
  }

  blake2b_increment_counter( S, S->buflen );
  blake2b_set_lastblock( S );
  memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
  blake2b_compress( S, S->buf );

  /* Serialise the full chaining value, then hand out only the requested prefix. */
  for( word = 0; word < 8; ++word )
  {
    store64( digest + word * sizeof( S->h[word] ), S->h[word] );
  }

  memcpy( out, digest, S->outlen );
  secure_zero_memory( digest, sizeof( digest ) );
  return 0;
}
513 
514 /* inlen, at least, should be uint64_t. Others can be size_t. */
blake2b(void * out,size_t outlen,const void * in,size_t inlen,const void * key,size_t keylen)515 int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
516 {
517   blake2b_state S[1];
518 
519   /* Verify parameters */
520   if ( NULL == in && inlen > 0 ) return -1;
521 
522   if ( NULL == out ) return -1;
523 
524   if( NULL == key && keylen > 0 ) return -1;
525 
526   if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
527 
528   if( keylen > BLAKE2B_KEYBYTES ) return -1;
529 
530   if( keylen > 0 )
531   {
532     if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
533   }
534   else
535   {
536     if( blake2b_init( S, outlen ) < 0 ) return -1;
537   }
538 
539   blake2b_update( S, ( const uint8_t * )in, inlen );
540   blake2b_final( S, out, outlen );
541   return 0;
542 }
543 
/* Generic entry point: forwards all arguments to blake2b() and returns its result. */
int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
{
  const int status = blake2b( out, outlen, in, inlen, key, keylen );

  return status;
}
547 
548 #if defined(SUPERCOP)
crypto_hash(unsigned char * out,unsigned char * in,unsigned long long inlen)549 int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
550 {
551   return blake2b( out, BLAKE2B_OUTBYTES, in, inlen, NULL, 0 );
552 }
553 #endif
554 
555 #if defined(BLAKE2B_SELFTEST)
556 #include <string.h>
557 #include "blake2-kat.h"
main(void)558 int main( void )
559 {
560   uint8_t key[BLAKE2B_KEYBYTES];
561   uint8_t buf[BLAKE2_KAT_LENGTH];
562   size_t i, step;
563 
564   for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
565     key[i] = ( uint8_t )i;
566 
567   for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
568     buf[i] = ( uint8_t )i;
569 
570   /* Test simple API */
571   for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
572   {
573     uint8_t hash[BLAKE2B_OUTBYTES];
574     blake2b( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );
575 
576     if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
577     {
578       goto fail;
579     }
580   }
581 
582   /* Test streaming API */
583   for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) {
584     for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
585       uint8_t hash[BLAKE2B_OUTBYTES];
586       blake2b_state S;
587       uint8_t * p = buf;
588       size_t mlen = i;
589       int err = 0;
590 
591       if( (err = blake2b_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) {
592         goto fail;
593       }
594 
595       while (mlen >= step) {
596         if ( (err = blake2b_update(&S, p, step)) < 0 ) {
597           goto fail;
598         }
599         mlen -= step;
600         p += step;
601       }
602       if ( (err = blake2b_update(&S, p, mlen)) < 0) {
603         goto fail;
604       }
605       if ( (err = blake2b_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) {
606         goto fail;
607       }
608 
609       if (0 != memcmp(hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES)) {
610         goto fail;
611       }
612     }
613   }
614 
615   puts( "ok" );
616   return 0;
617 fail:
618   puts("error");
619   return -1;
620 }
621 #endif
622