1 #include "arch.h"
2 #if !defined(JOHN_NO_SIMD) && (defined(__SSE2__) || defined(__SSE4_1__) || defined(__XOP__))
3 /*
4    BLAKE2 reference source code package - optimized C implementations
5 
6    Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
7 
8    To the extent possible under law, the author(s) have dedicated all copyright
9    and related and neighboring rights to this software to the public domain
10    worldwide. This software is distributed without any warranty.
11 
12    You should have received a copy of the CC0 Public Domain Dedication along with
13    this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
14 */
15 
16 #include <stdint.h>
17 #include <string.h>
18 #include <stdio.h>
19 
20 #include "blake2.h"
21 #include "blake2-impl.h"
22 
23 #include <emmintrin.h>
24 #if defined(__SSSE3__)
25 #include <tmmintrin.h>
26 #endif
27 #if defined(__SSE4_1__)
28 #include <smmintrin.h>
29 #endif
30 #if defined(__AVX__)
31 #include <immintrin.h>
32 #endif
33 #if defined(__XOP__)
34 #include <x86intrin.h>
35 #endif
36 
37 #include "blake2b-round.h"
38 
39 JTR_ALIGN( 64 ) static const uint64_t blake2b_IV[8] =
40 {
41   0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
42   0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
43   0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
44   0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
45 };
46 
47 /* Some helper functions, not necessarily useful */
/*
 * Raise the "last node" finalization flag by setting every bit of S->f[1].
 * Always returns 0.
 */
static inline int blake2b_set_lastnode( blake2b_state *S )
{
  S->f[1] = ~( uint64_t )0;
  return 0;
}
53 
/*
 * Raise the "last block" finalization flag by setting every bit of S->f[0].
 * When the state is marked as a last node (S->last_node is non-zero) the
 * last-node flag is raised first.  Always returns 0.
 */
static inline int blake2b_set_lastblock( blake2b_state *S )
{
  if ( S->last_node )
  {
    blake2b_set_lastnode( S );
  }

  S->f[0] = ~( uint64_t )0;
  return 0;
}
61 
/*
 * Add `inc` to the 128-bit counter stored as S->t[0] (low word) and
 * S->t[1] (high word), carrying from the low word into the high word.
 * Always returns 0.
 */
static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
{
#if __x86_64__
  /* Treat t[1]:t[0] as a single 128-bit integer so the addition can be
     compiled to an ADD/ADC pair. */
  const __uint128_t high = ( __uint128_t )S->t[1] << 64;
  const __uint128_t sum = ( high | S->t[0] ) + inc;

  S->t[0] = ( uint64_t )sum;
  S->t[1] = ( uint64_t )( sum >> 64 );
#else
  /* Portable path: the low word wrapped around iff the result is smaller
     than the addend, in which case one is carried into the high word. */
  const uint64_t low = S->t[0] + inc;

  S->t[0] = low;
  S->t[1] += ( low < inc );
#endif
  return 0;
}
76 
77 
78 /* init xors IV with input parameter block */
/*
 * Reset the whole state to zero and then set the chaining value to
 * IV XOR parameter block.
 *
 * The XOR is performed byte by byte over the first BLAKE2B_OUTBYTES bytes of
 * the in-memory representation of blake2b_IV and of *P, so *P must provide
 * at least that many readable bytes.
 *
 * S: state to initialize (fully overwritten).
 * P: parameter block; only read.
 * Returns 0.
 */
int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
{
  const uint8_t *iv_bytes = ( const uint8_t * )blake2b_IV;
  const uint8_t *param_bytes = ( const uint8_t * )P;
  uint8_t *h_bytes;
  size_t k;

  /* Clears counter, flags, buffer and buffer length along with h. */
  memset( S, 0, sizeof( *S ) );

  h_bytes = ( uint8_t * )S->h;

  for ( k = 0; k < BLAKE2B_OUTBYTES; ++k )
  {
    h_bytes[k] = iv_bytes[k] ^ param_bytes[k];
  }

  return 0;
}
94 
95 
96 /* Some sort of default parameter block initialization, for sequential blake2b */
/*
 * Initialize S for unkeyed, sequential hashing with an `outlen`-byte digest.
 *
 * outlen: requested digest size; must be in 1..BLAKE2B_OUTBYTES.
 * Returns -1 for an out-of-range outlen, otherwise the result of
 * blake2b_init_param().
 */
int blake2b_init( blake2b_state *S, const uint8_t outlen )
{
  /*
   * Positional initializer; the field order comes from blake2b_param in
   * blake2.h.  NOTE(review): presumably digest length, key length, fanout,
   * depth, leaf length, node offset, node depth, inner length, then the
   * reserved / salt / personalization arrays - confirm against the header.
   */
  const blake2b_param P = { outlen, 0, 1, 1, 0, 0, 0, 0, {0}, {0}, {0} };

  if ( outlen == 0 || outlen > BLAKE2B_OUTBYTES )
  {
    return -1;
  }

  return blake2b_init_param( S, &P );
}
116 
/*
 * Initialize S for keyed, sequential hashing with an `outlen`-byte digest.
 *
 * After the parameter block has been mixed into the chaining value, the key
 * is zero-padded to one full block (BLAKE2B_BLOCKBYTES) and fed through
 * blake2b_update() as the first block of input.
 *
 * outlen: requested digest size; must be in 1..BLAKE2B_OUTBYTES.
 * key:    key bytes; must not be NULL.
 * keylen: key size; must be in 1..BLAKE2B_KEYBYTES.
 * Returns 0 on success and -1 on invalid arguments or if state
 * initialization fails.
 */
int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
{
  /*
   * Positional initializer; the field order comes from blake2b_param in
   * blake2.h.  NOTE(review): presumably digest length, key length, fanout,
   * depth, leaf length, node offset, node depth, inner length, then the
   * reserved / salt / personalization arrays - confirm against the header.
   */
  const blake2b_param P =
  {
    outlen,
    keylen,
    1,
    1,
    0,
    0,
    0,
    0,
    {0},
    {0},
    {0}
  };
  uint8_t block[BLAKE2B_BLOCKBYTES];

  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;

  /* A NULL key would be dereferenced by the memcpy below. */
  if ( ( !key ) || ( !keylen ) || ( keylen > BLAKE2B_KEYBYTES ) ) return -1;

  /* Report the failure instead of returning success with an unkeyed state. */
  if ( blake2b_init_param( S, &P ) < 0 )
    return -1;

  memset( block, 0, sizeof( block ) );
  memcpy( block, key, keylen );

  /*
   * NOTE(review): the padded key block is left on the stack; the original
   * secure_zero_memory() call was disabled.  Confirm that this is
   * intentional before relying on this function for secret keys.
   */
  return blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
}
150 
#if !defined(__SSE4_1__)
/*
 * Read one 64-bit message word, in native byte order, from a byte pointer.
 * Going through memcpy avoids dereferencing a uint8_t buffer as uint64_t,
 * which would break the aliasing rules and assume 8-byte alignment.
 */
static inline uint64_t blake2b_load_word( const uint8_t *src )
{
  uint64_t w;
  memcpy( &w, src, sizeof( w ) );
  return w;
}
#endif

/*
 * Compress one BLAKE2B_BLOCKBYTES-byte message block into the chaining
 * value S->h, using the current counter S->t and finalization flags S->f.
 *
 * The 4x4 working state is kept as four rows, each split into a low and a
 * high 128-bit half:
 *   row1 = h[0..3], row2 = h[4..7], row3 = IV[0..3],
 *   row4 = IV[4..7] XOR ( t[0], t[1], f[0], f[1] ).
 * After the twelve rounds the chaining value is updated as
 *   h[0..3] ^= row1 ^ row3 and h[4..7] ^= row2 ^ row4.
 *
 * NOTE(review): ROUND comes from blake2b-round.h and is expected to refer to
 * the locals declared here (rows, b0/b1, t0/t1, r16/r24 and m0..m15) by
 * name, so none of them may be renamed or removed.  LOAD/STORE are
 * presumably aligned 128-bit accesses and LOADU an unaligned one - confirm
 * in blake2-impl.h / blake2b-round.h.
 *
 * Always returns 0.
 */
static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
  __m128i row1l, row1h;
  __m128i row2l, row2h;
  __m128i row3l, row3h;
  __m128i row4l, row4h;
  __m128i b0, b1;
  __m128i t0, t1;
#if defined(__SSSE3__) && !defined(__XOP__)
  /* Byte-shuffle masks; within each 64-bit lane, result byte i is taken from
     source byte (i + 2) mod 8 for r16 and (i + 3) mod 8 for r24. */
  const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
  const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(__SSE4_1__)
  /* Message block as eight 128-bit vectors (two 64-bit words each). */
  const __m128i m0 = LOADU( block + 00 );
  const __m128i m1 = LOADU( block + 16 );
  const __m128i m2 = LOADU( block + 32 );
  const __m128i m3 = LOADU( block + 48 );
  const __m128i m4 = LOADU( block + 64 );
  const __m128i m5 = LOADU( block + 80 );
  const __m128i m6 = LOADU( block + 96 );
  const __m128i m7 = LOADU( block + 112 );
#else
  /* Message block as sixteen scalar 64-bit words. */
  const uint64_t  m0 = blake2b_load_word( block +   0 );
  const uint64_t  m1 = blake2b_load_word( block +   8 );
  const uint64_t  m2 = blake2b_load_word( block +  16 );
  const uint64_t  m3 = blake2b_load_word( block +  24 );
  const uint64_t  m4 = blake2b_load_word( block +  32 );
  const uint64_t  m5 = blake2b_load_word( block +  40 );
  const uint64_t  m6 = blake2b_load_word( block +  48 );
  const uint64_t  m7 = blake2b_load_word( block +  56 );
  const uint64_t  m8 = blake2b_load_word( block +  64 );
  const uint64_t  m9 = blake2b_load_word( block +  72 );
  const uint64_t m10 = blake2b_load_word( block +  80 );
  const uint64_t m11 = blake2b_load_word( block +  88 );
  const uint64_t m12 = blake2b_load_word( block +  96 );
  const uint64_t m13 = blake2b_load_word( block + 104 );
  const uint64_t m14 = blake2b_load_word( block + 112 );
  const uint64_t m15 = blake2b_load_word( block + 120 );
#endif
  /* Set up the working state from the chaining value, IV, counter and flags. */
  row1l = LOAD( &S->h[0] );
  row1h = LOAD( &S->h[2] );
  row2l = LOAD( &S->h[4] );
  row2h = LOAD( &S->h[6] );
  row3l = LOAD( &blake2b_IV[0] );
  row3h = LOAD( &blake2b_IV[2] );
  row4l = _mm_xor_si128( LOAD( &blake2b_IV[4] ), LOAD( &S->t[0] ) );
  row4h = _mm_xor_si128( LOAD( &blake2b_IV[6] ), LOAD( &S->f[0] ) );
  /* Twelve rounds; the argument is the round index. */
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );
  /* Feed-forward: fold both halves of the working state into h. */
  row1l = _mm_xor_si128( row3l, row1l );
  row1h = _mm_xor_si128( row3h, row1h );
  STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) );
  STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) );
  row2l = _mm_xor_si128( row4l, row2l );
  row2h = _mm_xor_si128( row4h, row2h );
  STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) );
  STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) );
  return 0;
}
220 
221 
/*
 * Absorb `inlen` bytes from `in` into the state.
 *
 * S->buf holds up to two blocks.  Input is only compressed when it would
 * overflow that buffer: the buffer is topped up, its first block is
 * compressed, and the second block is moved to the front.  Input that fits
 * is merely buffered, so the most recent data always stays uncompressed for
 * blake2b_final() to process with the last-block flag set.
 *
 * Always returns 0.
 */
int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen )
{
  const size_t capacity = 2 * BLAKE2B_BLOCKBYTES;

  while ( inlen != 0 )
  {
    const size_t used = S->buflen;
    const size_t room = capacity - used;

    if ( inlen <= room )
    {
      /* The remainder fits: buffer it and stop without compressing. */
      memcpy( S->buf + used, in, inlen );
      S->buflen += inlen;
      break;
    }

    /* Top the buffer up to two full blocks ... */
    memcpy( S->buf + used, in, room );
    S->buflen += room;

    /* ... account for and compress the first block ... */
    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
    blake2b_compress( S, S->buf );

    /* ... and slide the second block down to the start of the buffer. */
    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES );
    S->buflen -= BLAKE2B_BLOCKBYTES;

    in += room;
    inlen -= room;
  }

  return 0;
}
251 
252 
/*
 * Finish the hash and write the first `outlen` bytes of the chaining value
 * to `out`.
 *
 * If more than one block is still buffered, the first one is compressed as
 * an ordinary block.  The remaining bytes are then counted, the last-block
 * flag is set, the rest of the buffer is zero-padded and compressed.
 *
 * out:    destination for the digest; must not be NULL.
 * outlen: number of digest bytes to copy; at most BLAKE2B_OUTBYTES.
 * Returns 0 on success and -1 (leaving S untouched) on invalid arguments.
 */
int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen )
{
  /* outlen is a uint8_t and may exceed the size of the chaining value;
     copying that many bytes would read past S->h. */
  if ( ( NULL == out ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;

  if ( S->buflen > BLAKE2B_BLOCKBYTES )
  {
    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
    blake2b_compress( S, S->buf );
    S->buflen -= BLAKE2B_BLOCKBYTES;
    /* At most one block remains, so source and destination cannot overlap. */
    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
  }

  /* Only the real message bytes are counted, not the padding. */
  blake2b_increment_counter( S, S->buflen );
  blake2b_set_lastblock( S );
  memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
  blake2b_compress( S, S->buf );
  memcpy( out, &S->h[0], outlen );
  return 0;
}
270 
271 
blake2b(uint8_t * out,const void * in,const void * key,const uint8_t outlen,const uint64_t inlen,uint8_t keylen)272 int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
273 {
274   blake2b_state S[1];
275 
276   /* Verify parameters */
277   if ( NULL == in ) return -1;
278 
279   if ( NULL == out ) return -1;
280 
281   if ( NULL == key ) keylen = 0;
282 
283   if ( keylen )
284   {
285     if ( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
286   }
287   else
288   {
289     if ( blake2b_init( S, outlen ) < 0 ) return -1;
290   }
291 
292   blake2b_update( S, ( uint8_t * )in, inlen );
293   blake2b_final( S, out, outlen );
294   return 0;
295 }
296 
297 #endif
298