1 #include "arch.h"
2 #if !defined(JOHN_NO_SIMD) && (defined(__SSE2__) || defined(__SSE4_1__) || defined(__XOP__))
3 /*
4 BLAKE2 reference source code package - optimized C implementations
5
6 Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
7
8 To the extent possible under law, the author(s) have dedicated all copyright
9 and related and neighboring rights to this software to the public domain
10 worldwide. This software is distributed without any warranty.
11
12 You should have received a copy of the CC0 Public Domain Dedication along with
13 this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
14 */
15
16 #include <stdint.h>
17 #include <string.h>
18 #include <stdio.h>
19
20 #include "blake2.h"
21 #include "blake2-impl.h"
22
23 #include <emmintrin.h>
24 #if defined(__SSSE3__)
25 #include <tmmintrin.h>
26 #endif
27 #if defined(__SSE4_1__)
28 #include <smmintrin.h>
29 #endif
30 #if defined(__AVX__)
31 #include <immintrin.h>
32 #endif
33 #if defined(__XOP__)
34 #include <x86intrin.h>
35 #endif
36
37 #include "blake2b-round.h"
38
39 JTR_ALIGN( 64 ) static const uint64_t blake2b_IV[8] =
40 {
41 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
42 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
43 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
44 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
45 };
46
47 /* Some helper functions, not necessarily useful */
/*
 * Set the second finalization flag word, S->f[1], to all ones (the
 * "last node" marker). Always returns 0.
 */
inline static int blake2b_set_lastnode( blake2b_state *S )
{
  S->f[1] = ~( uint64_t )0;
  return 0;
}
53
/*
 * Set the first finalization flag word, S->f[0], to all ones (the
 * "last block" marker). When S->last_node is non-zero the last-node flag is
 * set as well, before f[0] is written. Always returns 0.
 */
inline static int blake2b_set_lastblock( blake2b_state *S )
{
  if ( S->last_node )
  {
    blake2b_set_lastnode( S );
  }

  S->f[0] = ~( uint64_t )0;
  return 0;
}
61
/*
 * Add inc to the 128-bit counter held in S->t, where t[0] is the low word and
 * t[1] the high word. A carry out of the low word propagates into the high
 * word; the high word itself wraps silently. Always returns 0.
 */
inline static int blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
{
#if __x86_64__
  /* One 128-bit addition; the compiler can lower this to an ADD/ADC pair. */
  const __uint128_t high_part = ( __uint128_t )S->t[1] << 64;
  const __uint128_t total = ( high_part | S->t[0] ) + inc;

  S->t[0] = ( uint64_t )total;
  S->t[1] = ( uint64_t )( total >> 64 );
#else
  /* Portable form: the low word wrapped iff the sum is smaller than inc. */
  const uint64_t low = S->t[0] + inc;

  S->t[0] = low;

  if ( low < inc )
  {
    S->t[1] += 1;
  }
#endif
  return 0;
}
76
77
78 /* init xors IV with input parameter block */
blake2b_init_param(blake2b_state * S,const blake2b_param * P)79 int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
80 {
81 uint8_t *p, *h, *v;
82 int i;
83 //blake2b_init0( S );
84 v = ( uint8_t * )( blake2b_IV );
85 h = ( uint8_t * )( S->h );
86 p = ( uint8_t * )( P );
87 /* IV XOR ParamBlock */
88 memset( S, 0, sizeof( blake2b_state ) );
89
90 for ( i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
91
92 return 0;
93 }
94
95
96 /* Some sort of default parameter block initialization, for sequential blake2b */
blake2b_init(blake2b_state * S,const uint8_t outlen)97 int blake2b_init( blake2b_state *S, const uint8_t outlen )
98 {
99 const blake2b_param P =
100 {
101 outlen,
102 0,
103 1,
104 1,
105 0,
106 0,
107 0,
108 0,
109 {0},
110 {0},
111 {0}
112 };
113 if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
114 return blake2b_init_param( S, &P );
115 }
116
/*
 * Initialize S for keyed, sequential BLAKE2b with an outlen-byte digest.
 *
 * After the parameter block has been applied, the key is copied into a
 * zero-padded block of BLAKE2B_BLOCKBYTES bytes and fed to blake2b_update()
 * as the first input block.
 *
 * Returns 0 on success and -1 when
 *   - outlen is 0 or greater than BLAKE2B_OUTBYTES,
 *   - key is NULL, or keylen is 0 or greater than BLAKE2B_KEYBYTES,
 *   - blake2b_init_param() reports failure (previously this case returned 0,
 *     i.e. success, leaving the caller with an unusable state).
 *
 * The parameter block is filled positionally. The field names in the comments
 * below follow the BLAKE2 parameter-block layout; confirm them against the
 * declaration of blake2b_param in blake2.h.
 */
int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
{
  const blake2b_param P =
  {
    outlen, /* digest length */
    keylen, /* key length */
    1,      /* fanout */
    1,      /* depth */
    0,      /* leaf length */
    0,      /* node offset */
    0,      /* node depth */
    0,      /* inner length */
    {0},    /* reserved */
    {0},    /* salt */
    {0}     /* personalization */
  };

  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;

  /* A NULL key would be dereferenced by the memcpy below. */
  if ( ( NULL == key ) || ( !keylen ) || ( keylen > BLAKE2B_KEYBYTES ) ) return -1;

  if ( blake2b_init_param( S, &P ) < 0 )
    return -1;

  {
    uint8_t block[BLAKE2B_BLOCKBYTES];
    memset( block, 0, BLAKE2B_BLOCKBYTES );
    memcpy( block, key, keylen );
    blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
    /*
     * NOTE: the key copy is intentionally left on the stack; this file had
     * the secure_zero_memory() call disabled and that choice is preserved.
     */
  }
  return 0;
}
150
#if !defined(__SSE4_1__)
/*
 * Read the idx-th 64-bit word of the message block in native byte order.
 * memcpy is used instead of dereferencing a casted uint64_t pointer so the
 * read is valid for any alignment of block and does not violate the
 * effective-type (strict aliasing) rules.
 */
inline static uint64_t blake2b_compress_load_word( const uint8_t *block, const size_t idx )
{
  uint64_t word;
  memcpy( &word, block + idx * sizeof( word ), sizeof( word ) );
  return word;
}
#endif

/*
 * Run the BLAKE2b compression function on one message block and fold the
 * result into S->h.
 *
 * The working state is eight 128-bit registers (two 64-bit lanes each):
 *   rows 1-2 <- S->h[0..7]
 *   row  3   <- blake2b_IV[0..3]
 *   row  4   <- blake2b_IV[4..7] XOR ( S->t[0..1], S->f[0..1] )
 * Twelve ROUND() invocations follow, after which
 *   S->h[0..3] ^= row1 ^ row3 and S->h[4..7] ^= row2 ^ row4.
 *
 * ROUND, LOAD, LOADU and STORE are macros from the included project headers
 * and are not visible here. NOTE(review): ROUND presumably refers to the
 * locals below (row*, b0/b1, t0/t1, m0.., r16/r24) by name -- do not rename
 * them without checking blake2b-round.h.
 *
 * Always returns 0.
 */
inline static int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
  __m128i row1l, row1h;
  __m128i row2l, row2h;
  __m128i row3l, row3h;
  __m128i row4l, row4h;
  __m128i b0, b1;
  __m128i t0, t1;
#if defined(__SSSE3__) && !defined(__XOP__)
  /* Byte-shuffle masks, presumably used by the round macros for the 16- and 24-bit rotations. */
  const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
  const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(__SSE4_1__)
  /* SSE4.1 path: the message is held as eight 128-bit vectors. */
  const __m128i m0 = LOADU( block +   0 );
  const __m128i m1 = LOADU( block +  16 );
  const __m128i m2 = LOADU( block +  32 );
  const __m128i m3 = LOADU( block +  48 );
  const __m128i m4 = LOADU( block +  64 );
  const __m128i m5 = LOADU( block +  80 );
  const __m128i m6 = LOADU( block +  96 );
  const __m128i m7 = LOADU( block + 112 );
#else
  /* Plain SSE2 path: the message is held as sixteen 64-bit scalars. */
  const uint64_t  m0 = blake2b_compress_load_word( block,  0 );
  const uint64_t  m1 = blake2b_compress_load_word( block,  1 );
  const uint64_t  m2 = blake2b_compress_load_word( block,  2 );
  const uint64_t  m3 = blake2b_compress_load_word( block,  3 );
  const uint64_t  m4 = blake2b_compress_load_word( block,  4 );
  const uint64_t  m5 = blake2b_compress_load_word( block,  5 );
  const uint64_t  m6 = blake2b_compress_load_word( block,  6 );
  const uint64_t  m7 = blake2b_compress_load_word( block,  7 );
  const uint64_t  m8 = blake2b_compress_load_word( block,  8 );
  const uint64_t  m9 = blake2b_compress_load_word( block,  9 );
  const uint64_t m10 = blake2b_compress_load_word( block, 10 );
  const uint64_t m11 = blake2b_compress_load_word( block, 11 );
  const uint64_t m12 = blake2b_compress_load_word( block, 12 );
  const uint64_t m13 = blake2b_compress_load_word( block, 13 );
  const uint64_t m14 = blake2b_compress_load_word( block, 14 );
  const uint64_t m15 = blake2b_compress_load_word( block, 15 );
#endif
  /* Set up the working state from the chaining value, IV, counter and flags. */
  row1l = LOAD( &S->h[0] );
  row1h = LOAD( &S->h[2] );
  row2l = LOAD( &S->h[4] );
  row2h = LOAD( &S->h[6] );
  row3l = LOAD( &blake2b_IV[0] );
  row3h = LOAD( &blake2b_IV[2] );
  row4l = _mm_xor_si128( LOAD( &blake2b_IV[4] ), LOAD( &S->t[0] ) );
  row4h = _mm_xor_si128( LOAD( &blake2b_IV[6] ), LOAD( &S->f[0] ) );
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );
  /* Feed-forward: h ^= ( upper half of the state ) ^ ( lower half of the state ). */
  row1l = _mm_xor_si128( row3l, row1l );
  row1h = _mm_xor_si128( row3h, row1h );
  STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) );
  STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) );
  row2l = _mm_xor_si128( row4l, row2l );
  row2h = _mm_xor_si128( row4h, row2h );
  STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) );
  STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) );
  return 0;
}
220
221
/*
 * Absorb inlen bytes from in into the hashing state S.
 *
 * S->buf holds up to two blocks. Input is appended to it; only when more
 * input remains than fits in the buffer is the buffer topped up, its first
 * block compressed (after advancing the counter by BLAKE2B_BLOCKBYTES) and its
 * second block moved to the front. Input that fits is merely buffered, so the
 * last bytes are always left for blake2b_final(). Always returns 0.
 */
int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen )
{
  while ( inlen > 0 )
  {
    const size_t used = S->buflen;
    const size_t room = 2 * BLAKE2B_BLOCKBYTES - used;

    if ( inlen <= room )
    {
      /* The remainder fits: buffer it and stop without compressing. */
      memcpy( S->buf + used, in, inlen );
      S->buflen += inlen;
      break;
    }

    /* Top the buffer up to two full blocks and consume the first one. */
    memcpy( S->buf + used, in, room );
    S->buflen += room;
    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
    blake2b_compress( S, S->buf );

    /* Slide the second block down to the start of the buffer. */
    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES );
    S->buflen -= BLAKE2B_BLOCKBYTES;

    in += room;
    inlen -= room;
  }

  return 0;
}
251
252
/*
 * Finish the hash and write the first outlen bytes of the chaining value to
 * out.
 *
 * If more than one block is buffered, the first block is compressed normally.
 * The remaining bytes are then counted, the last-block flag is set, the
 * buffer is zero-padded and compressed one final time.
 *
 * Returns 0 on success. Returns -1, without touching S, when out is NULL or
 * outlen exceeds BLAKE2B_OUTBYTES: S->h holds only BLAKE2B_OUTBYTES bytes of
 * digest, so a larger outlen would copy unrelated state bytes to the caller.
 */
int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen )
{
  if ( NULL == out || outlen > BLAKE2B_OUTBYTES ) return -1;

  if ( S->buflen > BLAKE2B_BLOCKBYTES )
  {
    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
    blake2b_compress( S, S->buf );
    S->buflen -= BLAKE2B_BLOCKBYTES;
    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
  }

  blake2b_increment_counter( S, S->buflen );
  blake2b_set_lastblock( S );
  memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
  blake2b_compress( S, S->buf );
  memcpy( out, &S->h[0], outlen );
  return 0;
}
270
271
blake2b(uint8_t * out,const void * in,const void * key,const uint8_t outlen,const uint64_t inlen,uint8_t keylen)272 int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
273 {
274 blake2b_state S[1];
275
276 /* Verify parameters */
277 if ( NULL == in ) return -1;
278
279 if ( NULL == out ) return -1;
280
281 if ( NULL == key ) keylen = 0;
282
283 if ( keylen )
284 {
285 if ( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
286 }
287 else
288 {
289 if ( blake2b_init( S, outlen ) < 0 ) return -1;
290 }
291
292 blake2b_update( S, ( uint8_t * )in, inlen );
293 blake2b_final( S, out, outlen );
294 return 0;
295 }
296
297 #endif
298