1/* NTLM kernel (OpenCL 1.2 conformant)
2 *
3 * Written by Alain Espinosa <alainesp at gmail.com> in 2010 and modified by
4 * Samuele Giovanni Tonon in 2011. No copyright is claimed, and
5 * the software is hereby placed in the public domain.
6 * In case this attempt to disclaim copyright and place the software in the
7 * public domain is deemed null and void, then the software is
8 * Copyright (c) 2010 Alain Espinosa
9 * Copyright (c) 2011 Samuele Giovanni Tonon
10 * Copyright (c) 2015 Sayantan Datta <sdatta at openwall.com>
11 * Copyright (c) 2015 magnum
12 * and it is hereby released to the general public under the following terms:
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted.
16 *
17 * There's ABSOLUTELY NO WARRANTY, express or implied.
18 *
19 * (This is a heavily cut-down "BSD license".)
20 */
21
22#define AMD_PUTCHAR_NOCAST
23#include "opencl_misc.h"
24#include "opencl_md4.h"
25#include "opencl_unicode.h"
26#include "opencl_mask.h"
27
28//Init values
29#define INIT_A 0x67452301
30#define INIT_B 0xefcdab89
31#define INIT_C 0x98badcfe
32#define INIT_D 0x10325476
33
34#define SQRT_2 0x5a827999
35#define SQRT_3 0x6ed9eba1
36
37#if BITMAP_SIZE_BITS_LESS_ONE < 0xffffffff
38#define BITMAP_SIZE_BITS (BITMAP_SIZE_BITS_LESS_ONE + 1)
39#else
40#error BITMAP_SIZE_BITS_LESS_ONE too large
41#endif
42
/*
 * Raw MD4 of the single 64-byte block held in nt_buffer[0..13] plus the
 * bit-length word (NTLM is MD4 of the UTF-16LE password).  Optimizations:
 *  - Round-1 steps whose inputs are all constants are pre-folded
 *    (0xFFFFFFFF == INIT_A + MD4_F(INIT_B, INIT_C, INIT_D)).
 *  - Step 2 uses the identity MD4_F(x,y,z) == z ^ (x & (y ^ z)), where
 *    INIT_B ^ INIT_C == 0x77777777.
 *  - The final "+= initial state" additions are omitted, and the last steps
 *    are only partially computed via `tmp` (hash[1] is left before its final
 *    "+ SQRT_3; rotate 15").  NOTE(review): this only works because the
 *    host side prepares the stored candidate hashes in the same partial
 *    form — verify against the format's host code.
 *
 * hash      : out, the four (partial) MD4 state words
 * nt_buffer : in, message words 0..13 (UTF-16LE key + 0x80 padding)
 * md4_size  : message length in bits; stands in for block word 14
 *             (word 15 is implicitly zero and simply not added).
 */
inline void nt_crypt(__private uint *hash, __private uint *nt_buffer, uint md4_size) {
	uint tmp;

	/* Round 1 */
	hash[0] = 0xFFFFFFFF + nt_buffer[0]; hash[0]=rotate(hash[0], 3u);
	hash[3] = INIT_D + (INIT_C ^ (hash[0] & 0x77777777)) + nt_buffer[1]; hash[3]=rotate(hash[3], 7u);
	hash[2] = INIT_C + MD4_F(hash[3], hash[0], INIT_B)   + nt_buffer[2]; hash[2]=rotate(hash[2], 11u);
	hash[1] = INIT_B + MD4_F(hash[2], hash[3], hash[0])  + nt_buffer[3]; hash[1]=rotate(hash[1], 19u);

	hash[0] += MD4_F(hash[1], hash[2], hash[3])  +  nt_buffer[4] ; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_F(hash[0], hash[1], hash[2])  +  nt_buffer[5] ; hash[3] = rotate(hash[3] , 7u );
	hash[2] += MD4_F(hash[3], hash[0], hash[1])  +  nt_buffer[6] ; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_F(hash[2], hash[3], hash[0])  +  nt_buffer[7] ; hash[1] = rotate(hash[1] , 19u);

	hash[0] += MD4_F(hash[1], hash[2], hash[3])  +  nt_buffer[8] ; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_F(hash[0], hash[1], hash[2])  +  nt_buffer[9] ; hash[3] = rotate(hash[3] , 7u );
	hash[2] += MD4_F(hash[3], hash[0], hash[1])  +  nt_buffer[10]; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_F(hash[2], hash[3], hash[0])  +  nt_buffer[11]; hash[1] = rotate(hash[1] , 19u);

	hash[0] += MD4_F(hash[1], hash[2], hash[3])  +  nt_buffer[12]; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_F(hash[0], hash[1], hash[2])  +  nt_buffer[13]; hash[3] = rotate(hash[3] , 7u );
	/* Block word 14 is the bit length; word 15 is zero (no addition) */
	hash[2] += MD4_F(hash[3], hash[0], hash[1])  +    md4_size   ; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_F(hash[2], hash[3], hash[0])                  ; hash[1] = rotate(hash[1] , 19u);

	/* Round 2 */

	hash[0] += MD4_G(hash[1], hash[2], hash[3]) + nt_buffer[0] + SQRT_2; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_G(hash[0], hash[1], hash[2]) + nt_buffer[4] + SQRT_2; hash[3] = rotate(hash[3] , 5u );
	hash[2] += MD4_G(hash[3], hash[0], hash[1]) + nt_buffer[8] + SQRT_2; hash[2] = rotate(hash[2] , 9u );
	hash[1] += MD4_G(hash[2], hash[3], hash[0]) + nt_buffer[12]+ SQRT_2; hash[1] = rotate(hash[1] , 13u);

	hash[0] += MD4_G(hash[1], hash[2], hash[3]) + nt_buffer[1] + SQRT_2; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_G(hash[0], hash[1], hash[2]) + nt_buffer[5] + SQRT_2; hash[3] = rotate(hash[3] , 5u );
	hash[2] += MD4_G(hash[3], hash[0], hash[1]) + nt_buffer[9] + SQRT_2; hash[2] = rotate(hash[2] , 9u );
	hash[1] += MD4_G(hash[2], hash[3], hash[0]) + nt_buffer[13]+ SQRT_2; hash[1] = rotate(hash[1] , 13u);

	hash[0] += MD4_G(hash[1], hash[2], hash[3]) + nt_buffer[2] + SQRT_2; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_G(hash[0], hash[1], hash[2]) + nt_buffer[6] + SQRT_2; hash[3] = rotate(hash[3] , 5u );
	hash[2] += MD4_G(hash[3], hash[0], hash[1]) + nt_buffer[10]+ SQRT_2; hash[2] = rotate(hash[2] , 9u );
	hash[1] += MD4_G(hash[2], hash[3], hash[0]) +   md4_size   + SQRT_2; hash[1] = rotate(hash[1] , 13u);

	hash[0] += MD4_G(hash[1], hash[2], hash[3]) + nt_buffer[3] + SQRT_2; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_G(hash[0], hash[1], hash[2]) + nt_buffer[7] + SQRT_2; hash[3] = rotate(hash[3] , 5u );
	hash[2] += MD4_G(hash[3], hash[0], hash[1]) + nt_buffer[11]+ SQRT_2; hash[2] = rotate(hash[2] , 9u );
	hash[1] += MD4_G(hash[2], hash[3], hash[0])                + SQRT_2; hash[1] = rotate(hash[1] , 13u);

	/* Round 3 */
	hash[0] += MD4_H(hash[1], hash[2], hash[3]) + nt_buffer[0]  + SQRT_3; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_H2(hash[0], hash[1], hash[2]) + nt_buffer[8]  + SQRT_3; hash[3] = rotate(hash[3] , 9u );
	hash[2] += MD4_H(hash[3], hash[0], hash[1]) + nt_buffer[4]  + SQRT_3; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_H2(hash[2], hash[3], hash[0]) + nt_buffer[12] + SQRT_3; hash[1] = rotate(hash[1] , 15u);

	hash[0] += MD4_H(hash[1], hash[2], hash[3]) + nt_buffer[2]  + SQRT_3; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_H2(hash[0], hash[1], hash[2]) + nt_buffer[10] + SQRT_3; hash[3] = rotate(hash[3] , 9u );
	hash[2] += MD4_H(hash[3], hash[0], hash[1]) + nt_buffer[6]  + SQRT_3; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_H2(hash[2], hash[3], hash[0]) +   md4_size    + SQRT_3; hash[1] = rotate(hash[1] , 15u);

	hash[0] += MD4_H(hash[1], hash[2], hash[3]) + nt_buffer[1]  + SQRT_3; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_H2(hash[0], hash[1], hash[2]) + nt_buffer[9]  + SQRT_3; hash[3] = rotate(hash[3] , 9u );
	hash[2] += MD4_H(hash[3], hash[0], hash[1]) + nt_buffer[5]  + SQRT_3; hash[2] = rotate(hash[2] , 11u);
	/* It is cheaper to compute the remaining steps partially here than to
	 * fetch full hashes from global memory: hash[1] is stored WITHOUT its
	 * final "+ SQRT_3; rotate 15" — tmp carries the fully computed value
	 * into the last three steps instead. */
	hash[1] += MD4_H2(hash[2], hash[3], hash[0]) + nt_buffer[13];
	tmp = hash[1];
	tmp += SQRT_3; tmp = rotate(tmp , 15u);

	hash[0] += MD4_H(hash[3], hash[2], tmp) + nt_buffer[3]  + SQRT_3; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_H2(hash[2], tmp, hash[0]) + nt_buffer[11] + SQRT_3; hash[3] = rotate(hash[3] , 9u );
	hash[2] += MD4_H(tmp, hash[0], hash[3]) + nt_buffer[7]  + SQRT_3; hash[2] = rotate(hash[2] , 11u);
}
112
113#if __OS_X__ && (cpu(DEVICE_INFO) || gpu_nvidia(DEVICE_INFO))
114/* This is a workaround for driver/runtime bugs */
115#define MAYBE_VOLATILE volatile
116#else
117#define MAYBE_VOLATILE
118#endif
119
120#if UTF_8
121
/*
 * Decode a UTF-8 key (NOT NUL-terminated) from global memory into the
 * thread-private UTF-16LE buffer, append the 0x80 MD4 padding marker,
 * and return the key length in UTF-16 units.
 *
 * Decoding stops early on a truncated or invalid sequence, or when the
 * PLAINTEXT_LENGTH-unit target buffer would overflow; whatever was decoded
 * up to that point is used as the key.
 */
inline uint prepare_key(__global uint *key, uint length,
                        MAYBE_VOLATILE uint *nt_buffer)
{
	const __global UTF8 *source = (const __global UTF8*)key;
	const __global UTF8 *sourceEnd = &source[length];
	MAYBE_VOLATILE UTF16 *target = (UTF16*)nt_buffer;
	MAYBE_VOLATILE const UTF16 *targetEnd = &target[PLAINTEXT_LENGTH];
	UTF32 ch;
	uint extraBytesToRead;

	/* Input buffer is UTF-8 without zero-termination */
	while (source < sourceEnd) {
		/* Fast path: ASCII (and stray continuation bytes) copy through */
		if (*source < 0xC0) {
			*target++ = (UTF16)*source++;
			if (target >= targetEnd)
				break;
			continue;
		}
		ch = *source;
		// This point must not be reached with *source < 0xC0
		extraBytesToRead =
			opt_trailingBytesUTF8[ch & 0x3f];
		if (source + extraBytesToRead >= sourceEnd) {
			/* Truncated multi-byte sequence: stop decoding */
			break;
		}
		/* Accumulate continuation bytes; cases fall through on purpose */
		switch (extraBytesToRead) {
		case 3:
			ch <<= 6;
			ch += *++source;
			/* fallthrough */
		case 2:
			ch <<= 6;
			ch += *++source;
			/* fallthrough */
		case 1:
			ch <<= 6;
			ch += *++source;
			++source;
			break;
		default:
			/* More than 3 trailing bytes: invalid for Unicode */
			*target = UNI_REPLACEMENT_CHAR;
			break; // from switch
		}
		/* NOTE(review): on the valid path *target was not written this
		 * iteration, so this test relies on the caller zero-initializing
		 * nt_buffer (stale 0 != 0xFFFD) — confirm at call sites */
		if (*target == UNI_REPLACEMENT_CHAR)
			break; // from while
		ch -= offsetsFromUTF8[extraBytesToRead];
#ifdef UCS_2
		/* UCS-2 only */
		*target++ = (UTF16)ch;
#else
		/* full UTF-16 with surrogate pairs */
		if (ch <= UNI_MAX_BMP) {  /* Target is a character <= 0xFFFF */
			*target++ = (UTF16)ch;
		} else {  /* target is a character in range 0xFFFF - 0x10FFFF. */
			if (target + 1 >= targetEnd)
				break;
			ch -= halfBase;
			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
		}
#endif
		if (target >= targetEnd)
			break;
	}
	*target = 0x80;	// Terminate
	/* Length in UTF-16 units actually produced */
	return (uint)(target - (UTF16*)nt_buffer);
}
188
189#else
190
191inline uint prepare_key(__global uint *key, uint length, uint *nt_buffer)
192{
193	uint i, nt_index, keychars;
194
195	nt_index = 0;
196	for (i = 0; i < (length + 3)/ 4; i++) {
197		keychars = key[i];
198		nt_buffer[nt_index++] = CP_LUT(keychars & 0x000000FF) | (CP_LUT((keychars & 0x0000FF00) >> 8) << 16);
199		nt_buffer[nt_index++] = CP_LUT((keychars & 0x00FF0000) >> 16) | (CP_LUT(keychars >> 24) << 16);
200	}
201	nt_index = length >> 1;
202	nt_buffer[nt_index] = (nt_buffer[nt_index] & 0xFFFF) | (0x80 << ((length & 1) << 4));
203
204	return length;
205}
206
207#endif /* UTF_8 */
208
209inline void cmp_final(uint gid,
210		uint iter,
211		__private uint *hash,
212		__global uint *offset_table,
213		__global uint *hash_table,
214		__global uint *return_hashes,
215		volatile __global uint *output,
216		volatile __global uint *bitmap_dupe) {
217
218	uint t, offset_table_index, hash_table_index;
219	unsigned long LO, HI;
220	unsigned long p;
221
222	HI = ((unsigned long)hash[3] << 32) | (unsigned long)hash[2];
223	LO = ((unsigned long)hash[1] << 32) | (unsigned long)hash[0];
224
225	p = (HI % OFFSET_TABLE_SIZE) * SHIFT64_OT_SZ;
226	p += LO % OFFSET_TABLE_SIZE;
227	p %= OFFSET_TABLE_SIZE;
228	offset_table_index = (unsigned int)p;
229
230	//error: chances of overflow is extremely low.
231	LO += (unsigned long)offset_table[offset_table_index];
232
233	p = (HI % HASH_TABLE_SIZE) * SHIFT64_HT_SZ;
234	p += LO % HASH_TABLE_SIZE;
235	p %= HASH_TABLE_SIZE;
236	hash_table_index = (unsigned int)p;
237
238	if (hash_table[hash_table_index] == hash[0])
239	if (hash_table[HASH_TABLE_SIZE + hash_table_index] == hash[1])
240	{
241/*
242 * Prevent duplicate keys from cracking same hash
243 */
244		if (!(atomic_or(&bitmap_dupe[hash_table_index/32], (1U << (hash_table_index % 32))) & (1U << (hash_table_index % 32)))) {
245			t = atomic_inc(&output[0]);
246			output[1 + 3 * t] = gid;
247			output[2 + 3 * t] = iter;
248			output[3 + 3 * t] = hash_table_index;
249			return_hashes[2 * t] = hash[2];
250			return_hashes[2 * t + 1] = hash[3];
251		}
252	}
253}
254
/*
 * Bloom-filter-style pre-check of a computed hash.  SELECT_CMP_STEPS
 * independent lookups are made, each into its own sub-bitmap of
 * BITMAP_SIZE_BITS bits (sub-bitmap k starts at word offset
 * k * (BITMAP_SIZE_BITS >> 5)); tmp stays 1 only if every sampled 16-bit
 * chunk of the hash hits a set bit.  Survivors go to cmp_final() for the
 * exact check.
 */
inline void cmp(uint gid,
		uint iter,
		__private uint *hash,
#if USE_LOCAL_BITMAPS
		__local
#else
		__global
#endif
		uint *bitmaps,
		__global uint *offset_table,
		__global uint *hash_table,
		__global uint *return_hashes,
		volatile __global uint *output,
		volatile __global uint *bitmap_dupe) {
	uint bitmap_index, tmp = 1;

/* Intentionally left out: nt_crypt() omits the final addition of the MD4
 * initial state, and the stored hashes are presumably prepared the same
 * way (verify against host code), so these stay commented out. */
/*	hash[0] += 0x67452301;
	hash[1] += 0xefcdab89;
	hash[2] += 0x98badcfe;
	hash[3] += 0x10325476;*/

#if SELECT_CMP_STEPS > 4
	/* 8 lookups: low and high 16 bits of all four hash words.
	 * (BITMAP_SIZE_BITS >> 4) == 2 sub-bitmaps, (>> 3) == 4 sub-bitmaps. */
	bitmap_index = hash[0] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
	bitmap_index = (hash[0] >> 16) & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[1] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 4) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = (hash[1] >> 16) & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 3 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 3) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = (hash[2] >> 16) & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 5 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 6 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = (hash[3] >> 16) & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 7 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
#elif SELECT_CMP_STEPS > 2
	/* 4 lookups: low 16 bits of each hash word */
	bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[1] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 4) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[0] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 3 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
#elif SELECT_CMP_STEPS > 1
	/* 2 lookups */
	bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
#else
	/* Single lookup */
	bitmap_index = hash[3] & BITMAP_SIZE_BITS_LESS_ONE;
	tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
#endif

	if (tmp)
		cmp_final(gid, iter, hash, offset_table, hash_table, return_hashes, output, bitmap_dupe);
}
315
316#define USE_CONST_CACHE \
317	(CONST_CACHE_SIZE >= (NUM_INT_KEYS * 4))
318
319/* OpenCL kernel entry point. Copy key to be hashed from
320 * global to local (thread) memory. Break the key into 16 32-bit (uint)
321 * words. MD4 hash of a key is 128 bit (uint4). */
/*
 * NTLM kernel entry point: for each work-item, load its base key, then for
 * each of the NUM_INT_KEYS on-GPU mask candidates patch the placeholder
 * characters into the UTF-16 buffer, hash with nt_crypt() and compare via
 * the bitmaps.  Cracks are reported through out_hash_ids/return_hashes.
 */
__kernel void nt(__global uint *keys,
		  __global uint *index,
		  __global uint *int_key_loc,
#if USE_CONST_CACHE
		  constant
#else
		  __global
#endif
		  uint *int_keys
#if !defined(__OS_X__) && USE_CONST_CACHE && gpu_amd(DEVICE_INFO)
		__attribute__((max_constant_size (NUM_INT_KEYS * 4)))
#endif
		 , __global uint *bitmaps,
		  __global uint *offset_table,
		  __global uint *hash_table,
		  __global uint *return_hashes,
		  volatile __global uint *out_hash_ids,
		  volatile __global uint *bitmap_dupe)
{
	uint i;
	uint gid = get_global_id(0);
	/* index[gid] packs this key's start offset in keys[] (upper bits, in
	 * 32-bit words) with its length (low 7 bits) */
	uint base = index[gid];
	uint nt_buffer[14] = { 0 };	/* UTF-16LE key buffer, zeroed */
	uint md4_size = base & 127;	/* key length, for now */
	uint hash[4];

	/* Per-work-item placeholder positions for non-static GPU-side mask */
#if NUM_INT_KEYS > 1 && !IS_STATIC_GPU_MASK
	uint ikl = int_key_loc[gid];
	uint loc0 = ikl & 0xff;
#if MASK_FMT_INT_PLHDR > 1
#if LOC_1 >= 0
	uint loc1 = (ikl & 0xff00) >> 8;
#endif
#endif
#if MASK_FMT_INT_PLHDR > 2
#if LOC_2 >= 0
	uint loc2 = (ikl & 0xff0000) >> 16;
#endif
#endif
#if MASK_FMT_INT_PLHDR > 3
#if LOC_3 >= 0
	uint loc3 = (ikl & 0xff000000) >> 24;
#endif
#endif
#endif

	/* GPU_LOC_x resolves to either the per-item or compile-time-static
	 * placeholder position */
#if !IS_STATIC_GPU_MASK
#define GPU_LOC_0 loc0
#define GPU_LOC_1 loc1
#define GPU_LOC_2 loc2
#define GPU_LOC_3 loc3
#else
#define GPU_LOC_0 LOC_0
#define GPU_LOC_1 LOC_1
#define GPU_LOC_2 LOC_2
#define GPU_LOC_3 LOC_3
#endif

#if USE_LOCAL_BITMAPS
	/* Stage the bitmaps into local memory, copied cooperatively by the
	 * whole work-group in lws-sized strides */
	uint lid = get_local_id(0);
	uint lws = get_local_size(0);
	uint __local s_bitmaps[(BITMAP_SIZE_BITS >> 5) * SELECT_CMP_STEPS];

	for (i = 0; i < (((BITMAP_SIZE_BITS >> 5) * SELECT_CMP_STEPS) / lws); i++)
		s_bitmaps[i*lws + lid] = bitmaps[i*lws + lid];

	barrier(CLK_LOCAL_MEM_FENCE);
#endif

	/* Decode this work-item's key into nt_buffer; prepare_key returns the
	 * length in UTF-16 units, converted here to a bit count (chars * 16)
	 * for MD4's length word */
	keys += base >> 7;
	md4_size = prepare_key(keys, md4_size, nt_buffer);
	md4_size = md4_size << 4;

	for (i = 0; i < NUM_INT_KEYS; i++) {
		/* Patch candidate i's mask characters (codepage-translated via
		 * CP_LUT) into their placeholder positions */
#if NUM_INT_KEYS > 1
		PUTSHORT(nt_buffer, GPU_LOC_0, CP_LUT(int_keys[i] & 0xff));
#if MASK_FMT_INT_PLHDR > 1
#if LOC_1 >= 0
		PUTSHORT(nt_buffer, GPU_LOC_1, CP_LUT((int_keys[i] & 0xff00) >> 8));
#endif
#endif
#if MASK_FMT_INT_PLHDR > 2
#if LOC_2 >= 0
		PUTSHORT(nt_buffer, GPU_LOC_2, CP_LUT((int_keys[i] & 0xff0000) >> 16));
#endif
#endif
#if MASK_FMT_INT_PLHDR > 3
#if LOC_3 >= 0
		PUTSHORT(nt_buffer, GPU_LOC_3, CP_LUT((int_keys[i] & 0xff000000) >> 24));
#endif
#endif
#endif
		nt_crypt(hash, nt_buffer, md4_size);
		cmp(gid, i, hash,
#if USE_LOCAL_BITMAPS
		    s_bitmaps
#else
		    bitmaps
#endif
		    , offset_table, hash_table, return_hashes, out_hash_ids, bitmap_dupe);
	}
}
424