1 /*
 * This code is released under the following terms:
3  * No copyright is claimed, and the software is hereby placed in the public domain.
4  * In case this attempt to disclaim copyright and place the software in the public
5  * domain is deemed null and void, then the software is Copyright (c) 2013 JimF
6  * and it is hereby released to the general public under the following
7  * terms: This software may be modified, redistributed, and used for any
8  * purpose, in source and binary forms, with or without modification.
9  *
10  * This new code is 3x to 4x FASTER than the original oSSL code. Even though it is
11  * only using oSSL functions.  A lot of the high level stuff in oSSL sux for speed.
12  *
13  * SSE2 intrinsic code, May, 2013, Jim Fougeron.
14  *
 * skip_bytes means "skip leading output bytes" and can be given in
 * multiples of underlying hash size (in this case 64). So to calculate only
 * bytes 64-127 (the second chunk) you can say "outlen=64 skip_bytes=64"
 * for a 2x boost. The 1st byte of output array will then be 1st byte of second
 * chunk so its actual size can be 64 as opposed to 128.
20  */
21 
22 
23 #include <string.h>
24 #include <stdint.h>
25 
26 #include "arch.h"
27 #include "sha2.h"
28 #include "johnswap.h"
29 #include "simd-intrinsics.h"
30 
/* Bytes per SHA-512 input block; only supplied here when no earlier header defined it. */
#if !defined(SHA512_CBLOCK)
#define SHA512_CBLOCK 128
#endif

/* Bytes per SHA-512 digest; only supplied here when no earlier header defined it. */
#if !defined(SHA512_DIGEST_LENGTH)
#define SHA512_DIGEST_LENGTH 64
#endif
38 #if !defined(SIMD_COEF_64) || defined (PBKDF2_HMAC_SHA512_ALSO_INCLUDE_CTX)
39 
/*
 * Precompute the two HMAC-SHA512 pad states for one key.
 *
 *   K      key bytes
 *   KL     key length in bytes; a key longer than SHA512_CBLOCK is first
 *          replaced by its SHA-512 digest.  A zero or negative length mixes
 *          in no key bytes at all.
 *   pIpad  receives a context that has absorbed the block (key ^ 0x36...)
 *   pOpad  receives a context that has absorbed the block (key ^ 0x5C...)
 */
static void _pbkdf2_sha512_load_hmac(const unsigned char *K, int KL, SHA512_CTX *pIpad, SHA512_CTX *pOpad) {
	unsigned char ipad[SHA512_CBLOCK], opad[SHA512_CBLOCK], k0[SHA512_DIGEST_LENGTH];
	/* Signed on purpose: comparing an unsigned index against a negative KL
	 * would convert KL to a huge value and run off the end of the pads. */
	int i;

	memset(ipad, 0x36, SHA512_CBLOCK);
	memset(opad, 0x5C, SHA512_CBLOCK);

	// Over-long keys are hashed down to one digest, which then acts as the key.
	if (KL > SHA512_CBLOCK) {
		SHA512_CTX ctx;
		SHA512_Init( &ctx );
		SHA512_Update( &ctx, K, KL);
		SHA512_Final( k0, &ctx);
		KL = SHA512_DIGEST_LENGTH;
		K = k0;
	}
	for (i = 0; i < KL; i++) {
		ipad[i] ^= K[i];
		opad[i] ^= K[i];
	}
	// Save off the state after the first block of the ipad/opad hashes.  It is NEVER
	// recomputed during the rounds, only reused, which saves a share of the SHA512 work.
	SHA512_Init(pIpad);
	SHA512_Update(pIpad, ipad, SHA512_CBLOCK);
	SHA512_Init(pOpad);
	SHA512_Update(pOpad, opad, SHA512_CBLOCK);
}
66 
_pbkdf2_sha512(const unsigned char * S,int SL,int R,uint64_t * out,unsigned char loop,const SHA512_CTX * pIpad,const SHA512_CTX * pOpad)67 static void _pbkdf2_sha512(const unsigned char *S, int SL, int R, uint64_t *out,
68 	                     unsigned char loop, const SHA512_CTX *pIpad, const SHA512_CTX *pOpad) {
69 	SHA512_CTX ctx;
70 	unsigned i, j;
71 	unsigned char tmp_hash[SHA512_DIGEST_LENGTH];
72 
73 	memcpy(&ctx, pIpad, sizeof(SHA512_CTX));
74 	SHA512_Update(&ctx, S, SL);
75 	// this 4 byte BE 'loop' appended to the salt
76 	SHA512_Update(&ctx, "\x0\x0\x0", 3);
77 	SHA512_Update(&ctx, &loop, 1);
78 	SHA512_Final(tmp_hash, &ctx);
79 
80 	memcpy(&ctx, pOpad, sizeof(SHA512_CTX));
81 	SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
82 	SHA512_Final(tmp_hash, &ctx);
83 
84 	memcpy(out, tmp_hash, SHA512_DIGEST_LENGTH);
85 
86 	for (i = 1; i < R; i++) {
87 #if !defined(COMMON_DIGEST_FOR_OPENSSL)
88 		memcpy(&ctx, pIpad, 80);
89 #if defined(__JTR_SHA2___H_)
90 		ctx.total = pIpad->total;
91 		ctx.bIs512 = pIpad->bIs512;
92 #else
93 		ctx.num = pIpad->num;
94 		ctx.md_len = pIpad->md_len;
95 #endif
96 #else
97 		memcpy(&ctx, pIpad, sizeof(SHA512_CTX));
98 #endif
99 		SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
100 		SHA512_Final(tmp_hash, &ctx);
101 
102 #if !defined(COMMON_DIGEST_FOR_OPENSSL)
103 		memcpy(&ctx, pOpad, 80);
104 #if defined(__JTR_SHA2___H_)
105 		ctx.total = pOpad->total;
106 		ctx.bIs512 = pOpad->bIs512;
107 #else
108 		ctx.num = pOpad->num;
109 		ctx.md_len = pOpad->md_len;
110 #endif
111 #else
112 		memcpy(&ctx, pOpad, sizeof(SHA512_CTX));
113 #endif
114 		SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
115 		SHA512_Final(tmp_hash, &ctx);
116 
117 		for (j = 0; j < SHA512_DIGEST_LENGTH/sizeof(uint64_t); j++) {
118 			out[j] ^= ((uint64_t*)tmp_hash)[j];
119 #if defined (DPAPI_CRAP_LOGIC)
120 			((uint64_t*)tmp_hash)[j] = out[j];
121 #endif
122 		}
123 	}
124 }
125 
pbkdf2_sha512(const unsigned char * K,int KL,unsigned char * S,int SL,int R,unsigned char * out,int outlen,int skip_bytes)126 static void pbkdf2_sha512(const unsigned char *K, int KL, unsigned char *S, int SL, int R, unsigned char *out, int outlen, int skip_bytes)
127 {
128 	union {
129 		uint64_t x64[SHA512_DIGEST_LENGTH/sizeof(uint64_t)];
130 		unsigned char out[SHA512_DIGEST_LENGTH];
131 	} tmp;
132 	int loop, loops, i, accum=0;
133 	SHA512_CTX ipad, opad;
134 
135 	_pbkdf2_sha512_load_hmac(K, KL, &ipad, &opad);
136 
137 	loops = (skip_bytes + outlen + (SHA512_DIGEST_LENGTH-1)) / SHA512_DIGEST_LENGTH;
138 	loop = skip_bytes / SHA512_DIGEST_LENGTH + 1;
139 	while (loop <= loops) {
140 		_pbkdf2_sha512(S,SL,R,tmp.x64,loop,&ipad,&opad);
141 		for (i = skip_bytes%SHA512_DIGEST_LENGTH; i < SHA512_DIGEST_LENGTH && accum < outlen; i++) {
142 			out[accum++] = ((uint8_t*)tmp.out)[i];
143 		}
144 		loop++;
145 		skip_bytes = 0;
146 	}
147 }
148 
149 #endif
150 
151 #if defined (SIMD_COEF_64) && !defined(OPENCL_FORMAT)
152 
153 #ifndef __JTR_SHA2___H_
// We MUST call our own sha2.c functions so that the context layout is known.  Apple's CommonCrypto lib could be
// in use instead of JtR's sha2.c or oSSL, and CommonCrypto is NOT binary compatible, so JtR's code must be used here.
// To do that, the structure is defined here (if the header was not included), and the 'real' functions are declared here as well.
/*
 * Stand-in declaration of the sha2.c hashing context and its three entry
 * points, used when the sha2 header has not been included.  Member order
 * and types determine the binary layout and must not be rearranged.
 */
typedef struct
{
	uint64_t h[8];              /* SHA512 state */
	uint64_t Nl;                /* UNUSED but here to be compatible with oSSL */
	uint64_t Nh;                /* UNUSED but here to be compatible with oSSL */
	unsigned char buffer[128];  /* current/building data 'block'; it IS aligned */
	unsigned int num;           /* UNUSED but here to be compatible with oSSL */
	unsigned int md_len;        /* UNUSED but here to be compatible with oSSL */
	unsigned int total;         /* number of bytes processed */
	int bIs512;                 /* 1 selects SHA512.  NOTE(review): the original note names
	                             * SHA224 as the alternative -- verify against sha2.c */
} sha512_ctx;

extern void sha512_init(sha512_ctx *ctx, int bIs512);
extern void sha512_update(sha512_ctx *ctx, const void *input, int len);
extern void sha512_final(void *output, sha512_ctx *ctx);
169 #endif
170 
171 
172 #if SIMD_PARA_SHA512
173 #define SSE_GROUP_SZ_SHA512 (SIMD_COEF_64*SIMD_PARA_SHA512)
174 #else
175 #error No SIMD_PARA_SHA512 defined
176 #endif
177 
/*
 * Precompute the HMAC-SHA512 pad states for a whole group of keys.
 *
 *   K      SSE_GROUP_SZ_SHA512 key pointers (not modified)
 *   KL     SSE_GROUP_SZ_SHA512 key lengths in bytes (not modified); a key
 *          longer than SHA512_CBLOCK is replaced by its SHA-512 digest
 *          internally
 *   pIpad  per-key contexts that receive the state after the (key ^ 0x36) block
 *   pOpad  per-key contexts that receive the state after the (key ^ 0x5C) block
 */
static void _pbkdf2_sha512_sse_load_hmac(const unsigned char *K[SSE_GROUP_SZ_SHA512], int KL[SSE_GROUP_SZ_SHA512], SHA512_CTX pIpad[SSE_GROUP_SZ_SHA512], SHA512_CTX pOpad[SSE_GROUP_SZ_SHA512])
{
	unsigned char ipad[SHA512_CBLOCK], opad[SHA512_CBLOCK], k0[SHA512_DIGEST_LENGTH];
	int i, j;

	for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
		// Work on local copies.  Writing k0 and the shortened length back into the
		// caller's arrays would leave K[j] pointing at this function's dead stack
		// buffer (shared by every lane) once we return.
		const unsigned char *key = K[j];
		int key_len = KL[j];

		memset(ipad, 0x36, SHA512_CBLOCK);
		memset(opad, 0x5C, SHA512_CBLOCK);

		// Over-long keys are hashed down to one digest, which then acts as the key.
		if (key_len > SHA512_CBLOCK) {
			SHA512_CTX ctx;
			SHA512_Init( &ctx );
			SHA512_Update( &ctx, key, key_len);
			SHA512_Final( k0, &ctx);
			key_len = SHA512_DIGEST_LENGTH;
			key = k0;
		}
		for (i = 0; i < key_len; i++) {
			ipad[i] ^= key[i];
			opad[i] ^= key[i];
		}
		// save off the first 1/2 of the ipad/opad hashes.  We will NEVER recompute this
		// again, during the rounds, but reuse it. Saves 1/4 the SHA512's
		SHA512_Init(&(pIpad[j]));
		SHA512_Update(&(pIpad[j]), ipad, SHA512_CBLOCK);
		SHA512_Init(&(pOpad[j]));
		SHA512_Update(&(pOpad[j]), opad, SHA512_CBLOCK);
	}
}
207 
208 #if defined (SIMD_COEF_64) && !defined(OPENCL_FORMAT) && !(defined PBKDF2_HMAC_SHA512_VARYING_SALT)
pbkdf2_sha512_sse(const unsigned char * K[SSE_GROUP_SZ_SHA512],int KL[SSE_GROUP_SZ_SHA512],unsigned char * S,int SL,int R,unsigned char * out[SSE_GROUP_SZ_SHA512],int outlen,int skip_bytes)209 static void pbkdf2_sha512_sse(const unsigned char *K[SSE_GROUP_SZ_SHA512], int KL[SSE_GROUP_SZ_SHA512], unsigned char *S, int SL, int R, unsigned char *out[SSE_GROUP_SZ_SHA512], int outlen, int skip_bytes)
210 {
211 	unsigned char tmp_hash[SHA512_DIGEST_LENGTH];
212 	uint64_t *i1, *i2, *o1, *ptmp;
213 	unsigned int i, j;
214 	uint64_t dgst[SSE_GROUP_SZ_SHA512][SHA512_DIGEST_LENGTH/sizeof(uint64_t)];
215 	int loops, accum=0;
216 	unsigned char loop;
217 	SHA512_CTX ipad[SSE_GROUP_SZ_SHA512], opad[SSE_GROUP_SZ_SHA512], ctx;
218 
219 	// sse_hash1 would need to be 'adjusted' for SHA512_PARA
220 	JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_hash1[SHA_BUF_SIZ*sizeof(uint64_t)*SSE_GROUP_SZ_SHA512];
221 	JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_crypt1[SHA512_DIGEST_LENGTH*SSE_GROUP_SZ_SHA512];
222 	JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_crypt2[SHA512_DIGEST_LENGTH*SSE_GROUP_SZ_SHA512];
223 	i1 = (uint64_t*)sse_crypt1;
224 	i2 = (uint64_t*)sse_crypt2;
225 	o1 = (uint64_t*)sse_hash1;
226 
227 	// we need to set ONE time, the upper half of the data buffer.  We put the 0x80 byte (in BE format), at offset 64,
228 	// then zero out the rest of the buffer, putting 0x300 (#bits), into the proper location in the buffer.  Once this
229 	// part of the buffer is setup, we never touch it again, for the rest of the crypt.  We simply overwrite the first
230 	// half of this buffer, over and over again, with BE results of the prior hash.
231 	for (j = 0; j < SSE_GROUP_SZ_SHA512/SIMD_COEF_64; ++j) {
232 		ptmp = &o1[j*SIMD_COEF_64*SHA_BUF_SIZ];
233 		for (i = 0; i < SIMD_COEF_64; ++i)
234 			ptmp[ (SHA512_DIGEST_LENGTH/sizeof(uint64_t))*SIMD_COEF_64 + (i&(SIMD_COEF_64-1))] = 0x8000000000000000ULL;
235 		for (i = (SHA512_DIGEST_LENGTH/sizeof(uint64_t)+1)*SIMD_COEF_64; i < 15*SIMD_COEF_64; ++i)
236 			ptmp[i] = 0;
237 		for (i = 0; i < SIMD_COEF_64; ++i)
238 			ptmp[15*SIMD_COEF_64 + (i&(SIMD_COEF_64-1))] = ((128+SHA512_DIGEST_LENGTH)<<3); // all encrypts are 128+64 bytes.
239 	}
240 
241 	// Load up the IPAD and OPAD values, saving off the first half of the crypt.  We then push the ipad/opad all
242 	// the way to the end, and that ends up being the first iteration of the pbkdf2.  From that point on, we use
243 	// the 2 first halves, to load the sha512 2nd part of each crypt, in each loop.
244 	_pbkdf2_sha512_sse_load_hmac(K, KL, ipad, opad);
245 	for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
246 		ptmp = &i1[(j/SIMD_COEF_64)*SIMD_COEF_64*(SHA512_DIGEST_LENGTH/sizeof(uint64_t))+(j&(SIMD_COEF_64-1))];
247 		for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
248 #if COMMON_DIGEST_FOR_OPENSSL
249 			*ptmp = ipad[j].hash[i];
250 #else
251 			*ptmp = ipad[j].h[i];
252 #endif
253 			ptmp += SIMD_COEF_64;
254 		}
255 		ptmp = &i2[(j/SIMD_COEF_64)*SIMD_COEF_64*(SHA512_DIGEST_LENGTH/sizeof(uint64_t))+(j&(SIMD_COEF_64-1))];
256 		for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
257 #if COMMON_DIGEST_FOR_OPENSSL
258 			*ptmp = opad[j].hash[i];
259 #else
260 			*ptmp = opad[j].h[i];
261 #endif
262 			ptmp += SIMD_COEF_64;
263 		}
264 	}
265 
266 	loops = (skip_bytes + outlen + (SHA512_DIGEST_LENGTH-1)) / SHA512_DIGEST_LENGTH;
267 	loop = skip_bytes / SHA512_DIGEST_LENGTH + 1;
268 	while (loop <= loops) {
269 		for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
270 			memcpy(&ctx, &ipad[j], sizeof(ctx));
271 			SHA512_Update(&ctx, S, SL);
272 			// this BE 1 appended to the salt, allows us to do passwords up
273 			// to and including 128 bytes long.  If we wanted longer passwords,
274 			// then we would have to call the HMAC multiple times (with the
275 			// rounds between, but each chunk of password we would use a larger
276 			// BE number appended to the salt. The first roung (64 byte pw), and
277 			// we simply append the first number (0001 in BE)
278 			SHA512_Update(&ctx, "\x0\x0\x0", 3);
279 			SHA512_Update(&ctx, &loop, 1);
280 			SHA512_Final(tmp_hash, &ctx);
281 
282 			memcpy(&ctx, &opad[j], sizeof(ctx));
283 			SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
284 			SHA512_Final(tmp_hash, &ctx);
285 
286 			// now convert this from flat into SIMD_COEF_64 buffers.
287 			// Also, perform the 'first' ^= into the crypt buffer.  NOTE, we are doing that in BE format
288 			// so we will need to 'undo' that in the end.
289 			ptmp = &o1[(j/SIMD_COEF_64)*SIMD_COEF_64*SHA_BUF_SIZ+(j&(SIMD_COEF_64-1))];
290 			for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
291 #if COMMON_DIGEST_FOR_OPENSSL
292 				*ptmp = dgst[j][i] = ctx.hash[i];
293 #else
294 				*ptmp = dgst[j][i] = ctx.h[i];
295 #endif
296 				ptmp += SIMD_COEF_64;
297 			}
298 		}
299 
300 		// Here is the inner loop.  We loop from 1 to count.  iteration 0 was done in the ipad/opad computation.
301 		for (i = 1; i < R; i++) {
302 			unsigned int k;
303 			SIMDSHA512body(o1,o1,i1, SSEi_MIXED_IN|SSEi_RELOAD|SSEi_OUTPUT_AS_INP_FMT);
304 			SIMDSHA512body(o1,o1,i2, SSEi_MIXED_IN|SSEi_RELOAD|SSEi_OUTPUT_AS_INP_FMT);
305 			// only xor first 16 64-bit words
306 			for (k = 0; k < SSE_GROUP_SZ_SHA512; k++) {
307 				uint64_t *p = &o1[(k/SIMD_COEF_64)*SIMD_COEF_64*SHA_BUF_SIZ + (k&(SIMD_COEF_64-1))];
308 				for (j = 0; j < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); j++) {
309 					dgst[k][j] ^= p[j*SIMD_COEF_64];
310 #if defined (DPAPI_CRAP_LOGIC)
311 					p[(j*SIMD_COEF_64)] = dgst[k][j];
312 #endif
313 				}
314 			}
315 		}
316 
317 		// we must fixup final results.  We have been working in BE (NOT switching out of, just to switch back into it at every loop).
318 		// for the 'very' end of the crypt, we remove BE logic, so the calling function can view it in native format.
319 		alter_endianity_to_BE64(dgst, sizeof(dgst)/8);
320 		for (i = skip_bytes%SHA512_DIGEST_LENGTH; i < SHA512_DIGEST_LENGTH && accum < outlen; ++i) {
321 			for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
322 				out[j][accum] = ((unsigned char*)(dgst[j]))[i];
323 			}
324 			++accum;
325 		}
326 		++loop;
327 		skip_bytes = 0;
328 	}
329 }
330 #endif
331 
332 #if defined (PBKDF2_HMAC_SHA512_VARYING_SALT)
pbkdf2_sha512_sse_varying_salt(const unsigned char * K[SSE_GROUP_SZ_SHA512],int KL[SSE_GROUP_SZ_SHA512],unsigned char * S[SSE_GROUP_SZ_SHA512],int SL[SSE_GROUP_SZ_SHA512],int R,unsigned char * out[SSE_GROUP_SZ_SHA512],int outlen,int skip_bytes)333 static void pbkdf2_sha512_sse_varying_salt(const unsigned char *K[SSE_GROUP_SZ_SHA512], int KL[SSE_GROUP_SZ_SHA512], unsigned char *S[SSE_GROUP_SZ_SHA512], int SL[SSE_GROUP_SZ_SHA512], int R, unsigned char *out[SSE_GROUP_SZ_SHA512], int outlen, int skip_bytes)
334 {
335 	unsigned char tmp_hash[SHA512_DIGEST_LENGTH];
336 	uint64_t *i1, *i2, *o1, *ptmp;
337 	unsigned int i, j;
338 	uint64_t dgst[SSE_GROUP_SZ_SHA512][SHA512_DIGEST_LENGTH/sizeof(uint64_t)];
339 	int loops, accum=0;
340 	unsigned char loop;
341 	SHA512_CTX ipad[SSE_GROUP_SZ_SHA512], opad[SSE_GROUP_SZ_SHA512], ctx;
342 
343 	// sse_hash1 would need to be 'adjusted' for SHA512_PARA
344 	JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_hash1[SHA_BUF_SIZ*sizeof(uint64_t)*SSE_GROUP_SZ_SHA512];
345 	JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_crypt1[SHA512_DIGEST_LENGTH*SSE_GROUP_SZ_SHA512];
346 	JTR_ALIGN(MEM_ALIGN_SIMD) unsigned char sse_crypt2[SHA512_DIGEST_LENGTH*SSE_GROUP_SZ_SHA512];
347 	i1 = (uint64_t*)sse_crypt1;
348 	i2 = (uint64_t*)sse_crypt2;
349 	o1 = (uint64_t*)sse_hash1;
350 
351 	// we need to set ONE time, the upper half of the data buffer.  We put the 0x80 byte (in BE format), at offset 64,
352 	// then zero out the rest of the buffer, putting 0x300 (#bits), into the proper location in the buffer.  Once this
353 	// part of the buffer is setup, we never touch it again, for the rest of the crypt.  We simply overwrite the first
354 	// half of this buffer, over and over again, with BE results of the prior hash.
355 	for (j = 0; j < SSE_GROUP_SZ_SHA512/SIMD_COEF_64; ++j) {
356 		ptmp = &o1[j*SIMD_COEF_64*SHA_BUF_SIZ];
357 		for (i = 0; i < SIMD_COEF_64; ++i)
358 			ptmp[ (SHA512_DIGEST_LENGTH/sizeof(uint64_t))*SIMD_COEF_64 + (i&(SIMD_COEF_64-1))] = 0x8000000000000000ULL;
359 		for (i = (SHA512_DIGEST_LENGTH/sizeof(uint64_t)+1)*SIMD_COEF_64; i < 15*SIMD_COEF_64; ++i)
360 			ptmp[i] = 0;
361 		for (i = 0; i < SIMD_COEF_64; ++i)
362 			ptmp[15*SIMD_COEF_64 + (i&(SIMD_COEF_64-1))] = ((128+SHA512_DIGEST_LENGTH)<<3); // all encrypts are 128+64 bytes.
363 	}
364 
365 	// Load up the IPAD and OPAD values, saving off the first half of the crypt.  We then push the ipad/opad all
366 	// the way to the end, and that ends up being the first iteration of the pbkdf2.  From that point on, we use
367 	// the 2 first halves, to load the sha512 2nd part of each crypt, in each loop.
368 	_pbkdf2_sha512_sse_load_hmac(K, KL, ipad, opad);
369 	for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
370 		ptmp = &i1[(j/SIMD_COEF_64)*SIMD_COEF_64*(SHA512_DIGEST_LENGTH/sizeof(uint64_t))+(j&(SIMD_COEF_64-1))];
371 		for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
372 #if COMMON_DIGEST_FOR_OPENSSL
373 			*ptmp = ipad[j].hash[i];
374 #else
375 			*ptmp = ipad[j].h[i];
376 #endif
377 			ptmp += SIMD_COEF_64;
378 		}
379 		ptmp = &i2[(j/SIMD_COEF_64)*SIMD_COEF_64*(SHA512_DIGEST_LENGTH/sizeof(uint64_t))+(j&(SIMD_COEF_64-1))];
380 		for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
381 #if COMMON_DIGEST_FOR_OPENSSL
382 			*ptmp = opad[j].hash[i];
383 #else
384 			*ptmp = opad[j].h[i];
385 #endif
386 			ptmp += SIMD_COEF_64;
387 		}
388 	}
389 
390 	loops = (skip_bytes + outlen + (SHA512_DIGEST_LENGTH-1)) / SHA512_DIGEST_LENGTH;
391 	loop = skip_bytes / SHA512_DIGEST_LENGTH + 1;
392 	while (loop <= loops) {
393 		for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
394 			memcpy(&ctx, &ipad[j], sizeof(ctx));
395 			SHA512_Update(&ctx, S[j], SL[j]);
396 			// this BE 1 appended to the salt, allows us to do passwords up
397 			// to and including 128 bytes long.  If we wanted longer passwords,
398 			// then we would have to call the HMAC multiple times (with the
399 			// rounds between, but each chunk of password we would use a larger
400 			// BE number appended to the salt. The first roung (64 byte pw), and
401 			// we simply append the first number (0001 in BE)
402 			SHA512_Update(&ctx, "\x0\x0\x0", 3);
403 			SHA512_Update(&ctx, &loop, 1);
404 			SHA512_Final(tmp_hash, &ctx);
405 
406 			memcpy(&ctx, &opad[j], sizeof(ctx));
407 			SHA512_Update(&ctx, tmp_hash, SHA512_DIGEST_LENGTH);
408 			SHA512_Final(tmp_hash, &ctx);
409 
410 			// now convert this from flat into SIMD_COEF_64 buffers.
411 			// Also, perform the 'first' ^= into the crypt buffer.  NOTE, we are doing that in BE format
412 			// so we will need to 'undo' that in the end.
413 			ptmp = &o1[(j/SIMD_COEF_64)*SIMD_COEF_64*SHA_BUF_SIZ+(j&(SIMD_COEF_64-1))];
414 			for (i = 0; i < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); ++i) {
415 #if COMMON_DIGEST_FOR_OPENSSL
416 				*ptmp = dgst[j][i] = ctx.hash[i];
417 #else
418 				*ptmp = dgst[j][i] = ctx.h[i];
419 #endif
420 				ptmp += SIMD_COEF_64;
421 			}
422 		}
423 
424 		// Here is the inner loop.  We loop from 1 to count.  iteration 0 was done in the ipad/opad computation.
425 		for (i = 1; i < R; i++) {
426 			unsigned int k;
427 			SIMDSHA512body(o1,o1,i1, SSEi_MIXED_IN|SSEi_RELOAD|SSEi_OUTPUT_AS_INP_FMT);
428 			SIMDSHA512body(o1,o1,i2, SSEi_MIXED_IN|SSEi_RELOAD|SSEi_OUTPUT_AS_INP_FMT);
429 			// only xor first 16 64-bit words
430 			for (k = 0; k < SSE_GROUP_SZ_SHA512; k++) {
431 				uint64_t *p = &o1[(k/SIMD_COEF_64)*SIMD_COEF_64*SHA_BUF_SIZ + (k&(SIMD_COEF_64-1))];
432 				for (j = 0; j < (SHA512_DIGEST_LENGTH/sizeof(uint64_t)); j++) {
433 					dgst[k][j] ^= p[j*SIMD_COEF_64];
434 #if defined (DPAPI_CRAP_LOGIC)
435 					p[(j*SIMD_COEF_64)] = dgst[k][j];
436 #endif
437 				}
438 			}
439 		}
440 
441 		// we must fixup final results.  We have been working in BE (NOT switching out of, just to switch back into it at every loop).
442 		// for the 'very' end of the crypt, we remove BE logic, so the calling function can view it in native format.
443 		alter_endianity_to_BE64(dgst, sizeof(dgst)/8);
444 		for (i = skip_bytes%SHA512_DIGEST_LENGTH; i < SHA512_DIGEST_LENGTH && accum < outlen; ++i) {
445 			for (j = 0; j < SSE_GROUP_SZ_SHA512; ++j) {
446 				out[j][accum] = ((unsigned char*)(dgst[j]))[i];
447 			}
448 			++accum;
449 		}
450 		++loop;
451 		skip_bytes = 0;
452 	}
453 }
454 
455 #endif
456 
457 #endif
458