1 /*
2 * This code release under the following terms:
3 * No copyright is claimed, and the software is hereby placed in the public domain.
4 * In case this attempt to disclaim copyright and place the software in the public
5 * domain is deemed null and void, then the software is Copyright (c) 2013 JimF
6 * and it is hereby released to the general public under the following
7 * terms: This software may be modified, redistributed, and used for any
8 * purpose, in source and binary forms, with or without modification.
9 *
10 * This new code is 3x to 4x FASTER than the original oSSL code. Even though it is
11 * only using oSSL functions. A lot of the high level stuff in oSSL sux for speed.
12 *
13 * SSE2 intrinsic code, May, 2013, Jim Fougeron.
14 *
 * skip_bytes means "skip leading output bytes" and can be given in
 * multiples of the underlying hash size (in this case 64). So to calculate
 * only bytes 64-127 (the second chunk, counting from 0) you can say
 * "outlen=64 skip_bytes=64" for a 2x boost. The first byte of the output
 * array will then be the first byte of the second chunk, so the array only
 * needs to hold 64 bytes as opposed to 128.
20 */
21
22
23 #include <string.h>
24 #include <stdint.h>
25
26 #include "arch.h"
27 #include "sha2.h"
28 #include "johnswap.h"
29 #include "simd-intrinsics.h"
30
// Fallback definitions, used only when no earlier header supplied them.
// SHA512_CBLOCK sizes the HMAC pad buffers below (one hash input block).
#ifndef SHA512_CBLOCK
#define SHA512_CBLOCK 128
#endif
// SHA512_DIGEST_LENGTH is the number of bytes in one digest / one PBKDF2 output block.
#ifndef SHA512_DIGEST_LENGTH
#define SHA512_DIGEST_LENGTH 64
#endif
37
38 #if !defined(SIMD_COEF_64) || defined (PBKDF2_HMAC_SHA512_ALSO_INCLUDE_CTX)
39
/*
 * Prepare the two HMAC-SHA512 half-states for key K.
 *
 * The key is XORed into a block of 0x36 bytes (inner pad) and a block of
 * 0x5C bytes (outer pad), each SHA512_CBLOCK bytes long, and every padded
 * block is fed into a freshly initialized context. A key longer than
 * SHA512_CBLOCK bytes is first replaced by its SHA-512 digest.
 *
 * K     - key (password) bytes
 * KL    - number of bytes in K; a value <= 0 is treated as an empty key
 * pIpad - receives the context that has absorbed the inner-padded key block
 * pOpad - receives the context that has absorbed the outer-padded key block
 */
static void _pbkdf2_sha512_load_hmac(const unsigned char *K, int KL, SHA512_CTX *pIpad, SHA512_CTX *pOpad) {
	unsigned char ipad[SHA512_CBLOCK], opad[SHA512_CBLOCK], k0[SHA512_DIGEST_LENGTH];
	int i;

	memset(ipad, 0x36, SHA512_CBLOCK);
	memset(opad, 0x5C, SHA512_CBLOCK);

	if (KL > SHA512_CBLOCK) {
		SHA512_CTX ctx;
		SHA512_Init( &ctx );
		SHA512_Update( &ctx, K, KL);
		SHA512_Final( k0, &ctx);
		KL = SHA512_DIGEST_LENGTH;
		K = k0;
	}
	// The index is signed on purpose: with an unsigned index a negative KL
	// would be converted to a huge value and the loop would run far past
	// the end of ipad/opad.
	for (i = 0; i < KL; i++) {
		ipad[i] ^= K[i];
		opad[i] ^= K[i];
	}
	// save off the first 1/2 of the ipad/opad hashes. We will NEVER recompute this
	// again, during the rounds, but reuse it. Saves 1/4 the SHA512's
	SHA512_Init(pIpad);
	SHA512_Update(pIpad, ipad, SHA512_CBLOCK);
	SHA512_Init(pOpad);
	SHA512_Update(pOpad, opad, SHA512_CBLOCK);
}
66
_pbkdf2_sha512(const unsigned char * S,int SL,int R,uint64_t * out,unsigned char loop,const SHA512_CTX * pIpad,const SHA512_CTX * pOpad)67 static void _pbkdf2_sha512(const unsigned char *S, int SL, int R, uint64_t *out,
68 unsigned char loop, const SHA512_CTX *pIpad, const SHA512_CTX *pOpad) {
69 SHA512_CTX ctx;
70 unsigned i, j;
71 unsigned char tmp_hash[SHA512_DIGEST_LENGTH];
72
73 memcpy(&ctx, pIpad, sizeof(SHA512_CTX));
74 SHA512_Update(&ctx, S, SL);
75 // this 4 byte BE 'loop' appended to the salt
76 SHA512_Update(&ctx, "\x0\x0\x0", 3);
77 SHA512_Update(&ctx, &loop, 1);
78 SHA512_Final(tmp_hash, &ctx);
79
80 memcpy(&ctx, pOpad, sizeof(SHA512_CTX));
81 SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
82 SHA512_Final(tmp_hash, &ctx);
83
84 memcpy(out, tmp_hash, SHA512_DIGEST_LENGTH);
85
86 for (i = 1; i < R; i++) {
87 #if !defined(COMMON_DIGEST_FOR_OPENSSL)
88 memcpy(&ctx, pIpad, 80);
89 #if defined(__JTR_SHA2___H_)
90 ctx.total = pIpad->total;
91 ctx.bIs512 = pIpad->bIs512;
92 #else
93 ctx.num = pIpad->num;
94 ctx.md_len = pIpad->md_len;
95 #endif
96 #else
97 memcpy(&ctx, pIpad, sizeof(SHA512_CTX));
98 #endif
99 SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
100 SHA512_Final(tmp_hash, &ctx);
101
102 #if !defined(COMMON_DIGEST_FOR_OPENSSL)
103 memcpy(&ctx, pOpad, 80);
104 #if defined(__JTR_SHA2___H_)
105 ctx.total = pOpad->total;
106 ctx.bIs512 = pOpad->bIs512;
107 #else
108 ctx.num = pOpad->num;
109 ctx.md_len = pOpad->md_len;
110 #endif
111 #else
112 memcpy(&ctx, pOpad, sizeof(SHA512_CTX));
113 #endif
114 SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
115 SHA512_Final(tmp_hash, &ctx);
116
117 for (j = 0; j < SHA512_DIGEST_LENGTH/sizeof(uint64_t); j++) {
118 out[j] ^= ((uint64_t*)tmp_hash)[j];
119 #if defined (DPAPI_CRAP_LOGIC)
120 ((uint64_t*)tmp_hash)[j] = out[j];
121 #endif
122 }
123 }
124 }
125
pbkdf2_sha512(const unsigned char * K,int KL,unsigned char * S,int SL,int R,unsigned char * out,int outlen,int skip_bytes)126 static void pbkdf2_sha512(const unsigned char *K, int KL, unsigned char *S, int SL, int R, unsigned char *out, int outlen, int skip_bytes)
127 {
128 union {
129 uint64_t x64[SHA512_DIGEST_LENGTH/sizeof(uint64_t)];
130 unsigned char out[SHA512_DIGEST_LENGTH];
131 } tmp;
132 int loop, loops, i, accum=0;
133 SHA512_CTX ipad, opad;
134
135 _pbkdf2_sha512_load_hmac(K, KL, &ipad, &opad);
136
137 loops = (skip_bytes + outlen + (SHA512_DIGEST_LENGTH-1)) / SHA512_DIGEST_LENGTH;
138 loop = skip_bytes / SHA512_DIGEST_LENGTH + 1;
139 while (loop <= loops) {
140 _pbkdf2_sha512(S,SL,R,tmp.x64,loop,&ipad,&opad);
141 for (i = skip_bytes%SHA512_DIGEST_LENGTH; i < SHA512_DIGEST_LENGTH && accum < outlen; i++) {
142 out[accum++] = ((uint8_t*)tmp.out)[i];
143 }
144 loop++;
145 skip_bytes = 0;
146 }
147 }
148
149 #endif
150
151 #if defined (SIMD_COEF_64) && !defined(OPENCL_FORMAT)
152
153 #ifndef __JTR_SHA2___H_
// We MUST call our own sha2.c functions here, because we need to know the context layout. It is possible that Apple's
// CommonCrypto lib is in use instead of JtR's sha2.c or oSSL, and CommonCrypto is NOT binary compatible, so we MUST use JtR's code here.
// To do that, the structure is defined here (if the header was not included), and the 'real' functions are declared here as well.
// Local declaration of the SHA-512 context layout and its entry points,
// compiled only when __JTR_SHA2___H_ is not defined.
typedef struct
{
	uint64_t h[8]; // SHA512 state
	uint64_t Nl,Nh; // UNUSED but here to be compatible with oSSL
	unsigned char buffer[128]; // current/building data 'block'. It IS in alignment
	unsigned int num,md_len; // UNUSED but here to be compatible with oSSL
	unsigned int total; // number of bytes processed
	// NOTE(review): "SHA224" below may be a leftover from a SHA-256 variant of
	// this struct -- confirm which truncated hash bIs512 == 0 selects.
	int bIs512; // if 1 SHA512, else SHA224
} sha512_ctx;
// Implemented elsewhere; only declared here.
extern void sha512_init (sha512_ctx *ctx, int bIs512);
extern void sha512_update (sha512_ctx *ctx, const void *input, int len);
extern void sha512_final (void *output, sha512_ctx *ctx);
169 #endif
170
171
// Number of keys handled by one call of the SIMD functions below: it is the
// dimension of their K, KL and out array parameters. A build without a
// non-zero SIMD_PARA_SHA512 is rejected at compile time.
#if SIMD_PARA_SHA512
#define SSE_GROUP_SZ_SHA512 (SIMD_COEF_64*SIMD_PARA_SHA512)
#else
#error No SIMD_PARA_SHA512 defined
#endif
177
/*
 * Prepare the HMAC-SHA512 half-states for SSE_GROUP_SZ_SHA512 keys.
 *
 * For every key j the key bytes are XORed into a block of 0x36 bytes (inner
 * pad) and a block of 0x5C bytes (outer pad), and each padded block is fed
 * into a freshly initialized context. A key longer than SHA512_CBLOCK bytes
 * is first replaced by its SHA-512 digest.
 *
 * K     - array of key pointers; not modified
 * KL    - array of key lengths in bytes; not modified
 * pIpad - pIpad[j] receives the context holding the inner-padded block of key j
 * pOpad - pOpad[j] receives the context holding the outer-padded block of key j
 */
static void _pbkdf2_sha512_sse_load_hmac(const unsigned char *K[SSE_GROUP_SZ_SHA512], int KL[SSE_GROUP_SZ_SHA512], SHA512_CTX pIpad[SSE_GROUP_SZ_SHA512], SHA512_CTX pOpad[SSE_GROUP_SZ_SHA512])
{
	unsigned char ipad[SHA512_CBLOCK], opad[SHA512_CBLOCK], k0[SHA512_DIGEST_LENGTH];
	int i, j;

	for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
		// Work on local copies. Storing k0 (a local buffer) and the digest
		// length back into the caller's arrays would leave the caller with
		// pointers into this function's dead stack frame after it returns.
		const unsigned char *key = K[j];
		int keylen = KL[j];

		memset(ipad, 0x36, SHA512_CBLOCK);
		memset(opad, 0x5C, SHA512_CBLOCK);

		if (keylen > SHA512_CBLOCK) {
			SHA512_CTX ctx;
			SHA512_Init( &ctx );
			SHA512_Update( &ctx, key, keylen);
			SHA512_Final( k0, &ctx);
			keylen = SHA512_DIGEST_LENGTH;
			key = k0;
		}
		for (i = 0; i < keylen; i++) {
			ipad[i] ^= key[i];
			opad[i] ^= key[i];
		}
		// save off the first 1/2 of the ipad/opad hashes. We will NEVER recompute this
		// again, during the rounds, but reuse it. Saves 1/4 the SHA512's
		SHA512_Init(&(pIpad[j]));
		SHA512_Update(&(pIpad[j]), ipad, SHA512_CBLOCK);
		SHA512_Init(&(pOpad[j]));
		SHA512_Update(&(pOpad[j]), opad, SHA512_CBLOCK);
	}
}
207
208 #if defined (SIMD_COEF_64) && !defined(OPENCL_FORMAT) && !(defined PBKDF2_HMAC_SHA512_VARYING_SALT)
pbkdf2_sha512_sse(const unsigned char * K[SSE_GROUP_SZ_SHA512],int KL[SSE_GROUP_SZ_SHA512],unsigned char * S,int SL,int R,unsigned char * out[SSE_GROUP_SZ_SHA512],int outlen,int skip_bytes)209 static void pbkdf2_sha512_sse(const unsigned char *K[SSE_GROUP_SZ_SHA512], int KL[SSE_GROUP_SZ_SHA512], unsigned char *S, int SL, int R, unsigned char *out[SSE_GROUP_SZ_SHA512], int outlen, int skip_bytes)
210 {
211 unsigned char tmp_hash[SHA512_DIGEST_LENGTH];
212 uint64_t *i1, *i2, *o1, *ptmp;
213 unsigned int i, j;
214 uint64_t dgst[SSE_GROUP_SZ_SHA512][SHA512_DIGEST_LENGTH/sizeof(uint64_t)];
215 int loops, accum=0;
216 unsigned char loop;
217 SHA512_CTX ipad[SSE_GROUP_SZ_SHA512], opad[SSE_GROUP_SZ_SHA512], ctx;
218
219 // sse_hash1 would need to be 'adjusted' for SHA512_PARA
220 JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_hash1[SHA_BUF_SIZ*sizeof(uint64_t)*SSE_GROUP_SZ_SHA512];
221 JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_crypt1[SHA512_DIGEST_LENGTH*SSE_GROUP_SZ_SHA512];
222 JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_crypt2[SHA512_DIGEST_LENGTH*SSE_GROUP_SZ_SHA512];
223 i1 = (uint64_t*)sse_crypt1;
224 i2 = (uint64_t*)sse_crypt2;
225 o1 = (uint64_t*)sse_hash1;
226
227 // we need to set ONE time, the upper half of the data buffer. We put the 0x80 byte (in BE format), at offset 64,
228 // then zero out the rest of the buffer, putting 0x300 (#bits), into the proper location in the buffer. Once this
229 // part of the buffer is setup, we never touch it again, for the rest of the crypt. We simply overwrite the first
230 // half of this buffer, over and over again, with BE results of the prior hash.
231 for (j = 0; j < SSE_GROUP_SZ_SHA512/SIMD_COEF_64; ++j) {
232 ptmp = &o1[j*SIMD_COEF_64*SHA_BUF_SIZ];
233 for (i = 0; i < SIMD_COEF_64; ++i)
234 ptmp[ (SHA512_DIGEST_LENGTH/sizeof(uint64_t))*SIMD_COEF_64 + (i&(SIMD_COEF_64-1))] = 0x8000000000000000ULL;
235 for (i = (SHA512_DIGEST_LENGTH/sizeof(uint64_t)+1)*SIMD_COEF_64; i < 15*SIMD_COEF_64; ++i)
236 ptmp[i] = 0;
237 for (i = 0; i < SIMD_COEF_64; ++i)
238 ptmp[15*SIMD_COEF_64 + (i&(SIMD_COEF_64-1))] = ((128+SHA512_DIGEST_LENGTH)<<3); // all encrypts are 128+64 bytes.
239 }
240
241 // Load up the IPAD and OPAD values, saving off the first half of the crypt. We then push the ipad/opad all
242 // the way to the end, and that ends up being the first iteration of the pbkdf2. From that point on, we use
243 // the 2 first halves, to load the sha512 2nd part of each crypt, in each loop.
244 _pbkdf2_sha512_sse_load_hmac(K, KL, ipad, opad);
245 for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
246 ptmp = &i1[(j/SIMD_COEF_64)*SIMD_COEF_64*(SHA512_DIGEST_LENGTH/sizeof(uint64_t))+(j&(SIMD_COEF_64-1))];
247 for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
248 #if COMMON_DIGEST_FOR_OPENSSL
249 *ptmp = ipad[j].hash[i];
250 #else
251 *ptmp = ipad[j].h[i];
252 #endif
253 ptmp += SIMD_COEF_64;
254 }
255 ptmp = &i2[(j/SIMD_COEF_64)*SIMD_COEF_64*(SHA512_DIGEST_LENGTH/sizeof(uint64_t))+(j&(SIMD_COEF_64-1))];
256 for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
257 #if COMMON_DIGEST_FOR_OPENSSL
258 *ptmp = opad[j].hash[i];
259 #else
260 *ptmp = opad[j].h[i];
261 #endif
262 ptmp += SIMD_COEF_64;
263 }
264 }
265
266 loops = (skip_bytes + outlen + (SHA512_DIGEST_LENGTH-1)) / SHA512_DIGEST_LENGTH;
267 loop = skip_bytes / SHA512_DIGEST_LENGTH + 1;
268 while (loop <= loops) {
269 for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
270 memcpy(&ctx, &ipad[j], sizeof(ctx));
271 SHA512_Update(&ctx, S, SL);
272 // this BE 1 appended to the salt, allows us to do passwords up
273 // to and including 128 bytes long. If we wanted longer passwords,
274 // then we would have to call the HMAC multiple times (with the
275 // rounds between, but each chunk of password we would use a larger
276 // BE number appended to the salt. The first roung (64 byte pw), and
277 // we simply append the first number (0001 in BE)
278 SHA512_Update(&ctx, "\x0\x0\x0", 3);
279 SHA512_Update(&ctx, &loop, 1);
280 SHA512_Final(tmp_hash, &ctx);
281
282 memcpy(&ctx, &opad[j], sizeof(ctx));
283 SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
284 SHA512_Final(tmp_hash, &ctx);
285
286 // now convert this from flat into SIMD_COEF_64 buffers.
287 // Also, perform the 'first' ^= into the crypt buffer. NOTE, we are doing that in BE format
288 // so we will need to 'undo' that in the end.
289 ptmp = &o1[(j/SIMD_COEF_64)*SIMD_COEF_64*SHA_BUF_SIZ+(j&(SIMD_COEF_64-1))];
290 for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
291 #if COMMON_DIGEST_FOR_OPENSSL
292 *ptmp = dgst[j][i] = ctx.hash[i];
293 #else
294 *ptmp = dgst[j][i] = ctx.h[i];
295 #endif
296 ptmp += SIMD_COEF_64;
297 }
298 }
299
300 // Here is the inner loop. We loop from 1 to count. iteration 0 was done in the ipad/opad computation.
301 for (i = 1; i < R; i++) {
302 unsigned int k;
303 SIMDSHA512body(o1,o1,i1, SSEi_MIXED_IN|SSEi_RELOAD|SSEi_OUTPUT_AS_INP_FMT);
304 SIMDSHA512body(o1,o1,i2, SSEi_MIXED_IN|SSEi_RELOAD|SSEi_OUTPUT_AS_INP_FMT);
305 // only xor first 16 64-bit words
306 for (k = 0; k < SSE_GROUP_SZ_SHA512; k++) {
307 uint64_t *p = &o1[(k/SIMD_COEF_64)*SIMD_COEF_64*SHA_BUF_SIZ + (k&(SIMD_COEF_64-1))];
308 for (j = 0; j < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); j++) {
309 dgst[k][j] ^= p[j*SIMD_COEF_64];
310 #if defined (DPAPI_CRAP_LOGIC)
311 p[(j*SIMD_COEF_64)] = dgst[k][j];
312 #endif
313 }
314 }
315 }
316
317 // we must fixup final results. We have been working in BE (NOT switching out of, just to switch back into it at every loop).
318 // for the 'very' end of the crypt, we remove BE logic, so the calling function can view it in native format.
319 alter_endianity_to_BE64(dgst, sizeof(dgst)/8);
320 for (i = skip_bytes%SHA512_DIGEST_LENGTH; i < SHA512_DIGEST_LENGTH && accum < outlen; ++i) {
321 for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
322 out[j][accum] = ((unsigned char*)(dgst[j]))[i];
323 }
324 ++accum;
325 }
326 ++loop;
327 skip_bytes = 0;
328 }
329 }
330 #endif
331
332 #if defined (PBKDF2_HMAC_SHA512_VARYING_SALT)
pbkdf2_sha512_sse_varying_salt(const unsigned char * K[SSE_GROUP_SZ_SHA512],int KL[SSE_GROUP_SZ_SHA512],unsigned char * S[SSE_GROUP_SZ_SHA512],int SL[SSE_GROUP_SZ_SHA512],int R,unsigned char * out[SSE_GROUP_SZ_SHA512],int outlen,int skip_bytes)333 static void pbkdf2_sha512_sse_varying_salt(const unsigned char *K[SSE_GROUP_SZ_SHA512], int KL[SSE_GROUP_SZ_SHA512], unsigned char *S[SSE_GROUP_SZ_SHA512], int SL[SSE_GROUP_SZ_SHA512], int R, unsigned char *out[SSE_GROUP_SZ_SHA512], int outlen, int skip_bytes)
334 {
335 unsigned char tmp_hash[SHA512_DIGEST_LENGTH];
336 uint64_t *i1, *i2, *o1, *ptmp;
337 unsigned int i, j;
338 uint64_t dgst[SSE_GROUP_SZ_SHA512][SHA512_DIGEST_LENGTH/sizeof(uint64_t)];
339 int loops, accum=0;
340 unsigned char loop;
341 SHA512_CTX ipad[SSE_GROUP_SZ_SHA512], opad[SSE_GROUP_SZ_SHA512], ctx;
342
343 // sse_hash1 would need to be 'adjusted' for SHA512_PARA
344 JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_hash1[SHA_BUF_SIZ*sizeof(uint64_t)*SSE_GROUP_SZ_SHA512];
345 JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_crypt1[SHA512_DIGEST_LENGTH*SSE_GROUP_SZ_SHA512];
346 JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_crypt2[SHA512_DIGEST_LENGTH*SSE_GROUP_SZ_SHA512];
347 i1 = (uint64_t*)sse_crypt1;
348 i2 = (uint64_t*)sse_crypt2;
349 o1 = (uint64_t*)sse_hash1;
350
351 // we need to set ONE time, the upper half of the data buffer. We put the 0x80 byte (in BE format), at offset 64,
352 // then zero out the rest of the buffer, putting 0x300 (#bits), into the proper location in the buffer. Once this
353 // part of the buffer is setup, we never touch it again, for the rest of the crypt. We simply overwrite the first
354 // half of this buffer, over and over again, with BE results of the prior hash.
355 for (j = 0; j < SSE_GROUP_SZ_SHA512/SIMD_COEF_64; ++j) {
356 ptmp = &o1[j*SIMD_COEF_64*SHA_BUF_SIZ];
357 for (i = 0; i < SIMD_COEF_64; ++i)
358 ptmp[ (SHA512_DIGEST_LENGTH/sizeof(uint64_t))*SIMD_COEF_64 + (i&(SIMD_COEF_64-1))] = 0x8000000000000000ULL;
359 for (i = (SHA512_DIGEST_LENGTH/sizeof(uint64_t)+1)*SIMD_COEF_64; i < 15*SIMD_COEF_64; ++i)
360 ptmp[i] = 0;
361 for (i = 0; i < SIMD_COEF_64; ++i)
362 ptmp[15*SIMD_COEF_64 + (i&(SIMD_COEF_64-1))] = ((128+SHA512_DIGEST_LENGTH)<<3); // all encrypts are 128+64 bytes.
363 }
364
365 // Load up the IPAD and OPAD values, saving off the first half of the crypt. We then push the ipad/opad all
366 // the way to the end, and that ends up being the first iteration of the pbkdf2. From that point on, we use
367 // the 2 first halves, to load the sha512 2nd part of each crypt, in each loop.
368 _pbkdf2_sha512_sse_load_hmac(K, KL, ipad, opad);
369 for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
370 ptmp = &i1[(j/SIMD_COEF_64)*SIMD_COEF_64*(SHA512_DIGEST_LENGTH/sizeof(uint64_t))+(j&(SIMD_COEF_64-1))];
371 for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
372 #if COMMON_DIGEST_FOR_OPENSSL
373 *ptmp = ipad[j].hash[i];
374 #else
375 *ptmp = ipad[j].h[i];
376 #endif
377 ptmp += SIMD_COEF_64;
378 }
379 ptmp = &i2[(j/SIMD_COEF_64)*SIMD_COEF_64*(SHA512_DIGEST_LENGTH/sizeof(uint64_t))+(j&(SIMD_COEF_64-1))];
380 for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
381 #if COMMON_DIGEST_FOR_OPENSSL
382 *ptmp = opad[j].hash[i];
383 #else
384 *ptmp = opad[j].h[i];
385 #endif
386 ptmp += SIMD_COEF_64;
387 }
388 }
389
390 loops = (skip_bytes + outlen + (SHA512_DIGEST_LENGTH-1)) / SHA512_DIGEST_LENGTH;
391 loop = skip_bytes / SHA512_DIGEST_LENGTH + 1;
392 while (loop <= loops) {
393 for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
394 memcpy(&ctx, &ipad[j], sizeof(ctx));
395 SHA512_Update(&ctx, S[j], SL[j]);
396 // this BE 1 appended to the salt, allows us to do passwords up
397 // to and including 128 bytes long. If we wanted longer passwords,
398 // then we would have to call the HMAC multiple times (with the
399 // rounds between, but each chunk of password we would use a larger
400 // BE number appended to the salt. The first roung (64 byte pw), and
401 // we simply append the first number (0001 in BE)
402 SHA512_Update(&ctx, "\x0\x0\x0", 3);
403 SHA512_Update(&ctx, &loop, 1);
404 SHA512_Final(tmp_hash, &ctx);
405
406 memcpy(&ctx, &opad[j], sizeof(ctx));
407 SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
408 SHA512_Final(tmp_hash, &ctx);
409
410 // now convert this from flat into SIMD_COEF_64 buffers.
411 // Also, perform the 'first' ^= into the crypt buffer. NOTE, we are doing that in BE format
412 // so we will need to 'undo' that in the end.
413 ptmp = &o1[(j/SIMD_COEF_64)*SIMD_COEF_64*SHA_BUF_SIZ+(j&(SIMD_COEF_64-1))];
414 for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
415 #if COMMON_DIGEST_FOR_OPENSSL
416 *ptmp = dgst[j][i] = ctx.hash[i];
417 #else
418 *ptmp = dgst[j][i] = ctx.h[i];
419 #endif
420 ptmp += SIMD_COEF_64;
421 }
422 }
423
424 // Here is the inner loop. We loop from 1 to count. iteration 0 was done in the ipad/opad computation.
425 for (i = 1; i < R; i++) {
426 unsigned int k;
427 SIMDSHA512body(o1,o1,i1, SSEi_MIXED_IN|SSEi_RELOAD|SSEi_OUTPUT_AS_INP_FMT);
428 SIMDSHA512body(o1,o1,i2, SSEi_MIXED_IN|SSEi_RELOAD|SSEi_OUTPUT_AS_INP_FMT);
429 // only xor first 16 64-bit words
430 for (k = 0; k < SSE_GROUP_SZ_SHA512; k++) {
431 uint64_t *p = &o1[(k/SIMD_COEF_64)*SIMD_COEF_64*SHA_BUF_SIZ + (k&(SIMD_COEF_64-1))];
432 for (j = 0; j < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); j++) {
433 dgst[k][j] ^= p[j*SIMD_COEF_64];
434 #if defined (DPAPI_CRAP_LOGIC)
435 p[(j*SIMD_COEF_64)] = dgst[k][j];
436 #endif
437 }
438 }
439 }
440
441 // we must fixup final results. We have been working in BE (NOT switching out of, just to switch back into it at every loop).
442 // for the 'very' end of the crypt, we remove BE logic, so the calling function can view it in native format.
443 alter_endianity_to_BE64(dgst, sizeof(dgst)/8);
444 for (i = skip_bytes%SHA512_DIGEST_LENGTH; i < SHA512_DIGEST_LENGTH && accum < outlen; ++i) {
445 for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
446 out[j][accum] = ((unsigned char*)(dgst[j]))[i];
447 }
448 ++accum;
449 }
450 ++loop;
451 skip_bytes = 0;
452 }
453 }
454
455 #endif
456
457 #endif
458