/* NTLM kernel (OpenCL 1.2 conformant)
 *
 * Written by Alain Espinosa <alainesp at gmail.com> in 2010 and modified by
 * Samuele Giovanni Tonon in 2011. No copyright is claimed, and
 * the software is hereby placed in the public domain.
 * In case this attempt to disclaim copyright and place the software in the
 * public domain is deemed null and void, then the software is
 * Copyright (c) 2010 Alain Espinosa
 * Copyright (c) 2011 Samuele Giovanni Tonon
 * Copyright (c) 2015 Sayantan Datta <sdatta at openwall.com>
 * Copyright (c) 2015 magnum
 * and it is hereby released to the general public under the following terms:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
 *
 * There's ABSOLUTELY NO WARRANTY, express or implied.
 *
 * (This is a heavily cut-down "BSD license".)
 */

#define AMD_PUTCHAR_NOCAST
#include "opencl_misc.h"
#include "opencl_md4.h"
#include "opencl_unicode.h"
#include "opencl_mask.h"

/* MD4 initial chaining values A, B, C, D (RFC 1320). */
#define INIT_A 0x67452301
#define INIT_B 0xefcdab89
#define INIT_C 0x98badcfe
#define INIT_D 0x10325476

/* MD4 round 2/3 additive constants: floor(2^30 * sqrt(2)) and
 * floor(2^30 * sqrt(3)) respectively (RFC 1320). */
#define SQRT_2 0x5a827999
#define SQRT_3 0x6ed9eba1

/* BITMAP_SIZE_BITS_LESS_ONE is a build-time define (host side); it must be
 * representable so that + 1 does not wrap to zero. */
#if BITMAP_SIZE_BITS_LESS_ONE < 0xffffffff
#define BITMAP_SIZE_BITS (BITMAP_SIZE_BITS_LESS_ONE + 1)
#else
#error BITMAP_SIZE_BITS_LESS_ONE too large
#endif

/*
 * Single MD4 compression over one 64-byte NT message block.
 *
 * nt_buffer[0..13] holds the key as UTF-16 code units (two per uint, with the
 * 0x80 MD4 padding byte already placed after the last character); md4_size is
 * the key length in BITS (UTF-16 units << 4, see the caller) and takes the
 * place of message word 14, the low half of MD4's 64-bit length field. Word 15
 * is implicitly zero, which is why several steps below have no message term.
 *
 * Optimizations vs. a textbook MD4:
 *  - The first four steps are simplified using the constant initial state
 *    (e.g. 0xFFFFFFFF = INIT_A + F(B,C,D) + rotation pre-folded, and
 *    INIT_C ^ (hash[0] & 0x77777777) is F() with constant B, C inputs).
 *  - The final IV addition (hash[i] += INIT_x) is NOT performed here; see the
 *    commented-out additions in cmp(). Presumably the host stores the target
 *    hashes with the IV already subtracted — TODO confirm against host code.
 *  - The last round-3 steps are partially unrolled through `tmp` so the
 *    comparison values hash[0..3] are available without the full final step
 *    ordering.
 */
inline void nt_crypt(__private uint *hash, __private uint *nt_buffer, uint md4_size) {
	uint tmp;

	/* Round 1 */
	hash[0] = 0xFFFFFFFF + nt_buffer[0]; hash[0]=rotate(hash[0], 3u);
	hash[3] = INIT_D + (INIT_C ^ (hash[0] & 0x77777777)) + nt_buffer[1]; hash[3]=rotate(hash[3], 7u);
	hash[2] = INIT_C + MD4_F(hash[3], hash[0], INIT_B) + nt_buffer[2]; hash[2]=rotate(hash[2], 11u);
	hash[1] = INIT_B + MD4_F(hash[2], hash[3], hash[0]) + nt_buffer[3]; hash[1]=rotate(hash[1], 19u);

	hash[0] += MD4_F(hash[1], hash[2], hash[3]) + nt_buffer[4] ; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_F(hash[0], hash[1], hash[2]) + nt_buffer[5] ; hash[3] = rotate(hash[3] , 7u );
	hash[2] += MD4_F(hash[3], hash[0], hash[1]) + nt_buffer[6] ; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_F(hash[2], hash[3], hash[0]) + nt_buffer[7] ; hash[1] = rotate(hash[1] , 19u);

	hash[0] += MD4_F(hash[1], hash[2], hash[3]) + nt_buffer[8] ; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_F(hash[0], hash[1], hash[2]) + nt_buffer[9] ; hash[3] = rotate(hash[3] , 7u );
	hash[2] += MD4_F(hash[3], hash[0], hash[1]) + nt_buffer[10]; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_F(hash[2], hash[3], hash[0]) + nt_buffer[11]; hash[1] = rotate(hash[1] , 19u);

	hash[0] += MD4_F(hash[1], hash[2], hash[3]) + nt_buffer[12]; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_F(hash[0], hash[1], hash[2]) + nt_buffer[13]; hash[3] = rotate(hash[3] , 7u );
	/* Message word 14 is the bit length; word 15 is zero (no term added). */
	hash[2] += MD4_F(hash[3], hash[0], hash[1]) + md4_size ; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_F(hash[2], hash[3], hash[0]) ; hash[1] = rotate(hash[1] , 19u);

	/* Round 2 */

	hash[0] += MD4_G(hash[1], hash[2], hash[3]) + nt_buffer[0] + SQRT_2; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_G(hash[0], hash[1], hash[2]) + nt_buffer[4] + SQRT_2; hash[3] = rotate(hash[3] , 5u );
	hash[2] += MD4_G(hash[3], hash[0], hash[1]) + nt_buffer[8] + SQRT_2; hash[2] = rotate(hash[2] , 9u );
	hash[1] += MD4_G(hash[2], hash[3], hash[0]) + nt_buffer[12]+ SQRT_2; hash[1] = rotate(hash[1] , 13u);

	hash[0] += MD4_G(hash[1], hash[2], hash[3]) + nt_buffer[1] + SQRT_2; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_G(hash[0], hash[1], hash[2]) + nt_buffer[5] + SQRT_2; hash[3] = rotate(hash[3] , 5u );
	hash[2] += MD4_G(hash[3], hash[0], hash[1]) + nt_buffer[9] + SQRT_2; hash[2] = rotate(hash[2] , 9u );
	hash[1] += MD4_G(hash[2], hash[3], hash[0]) + nt_buffer[13]+ SQRT_2; hash[1] = rotate(hash[1] , 13u);

	hash[0] += MD4_G(hash[1], hash[2], hash[3]) + nt_buffer[2] + SQRT_2; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_G(hash[0], hash[1], hash[2]) + nt_buffer[6] + SQRT_2; hash[3] = rotate(hash[3] , 5u );
	hash[2] += MD4_G(hash[3], hash[0], hash[1]) + nt_buffer[10]+ SQRT_2; hash[2] = rotate(hash[2] , 9u );
	/* Round 2 accesses message word 14 (= the bit length) here. */
	hash[1] += MD4_G(hash[2], hash[3], hash[0]) + md4_size + SQRT_2; hash[1] = rotate(hash[1] , 13u);

	hash[0] += MD4_G(hash[1], hash[2], hash[3]) + nt_buffer[3] + SQRT_2; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_G(hash[0], hash[1], hash[2]) + nt_buffer[7] + SQRT_2; hash[3] = rotate(hash[3] , 5u );
	hash[2] += MD4_G(hash[3], hash[0], hash[1]) + nt_buffer[11]+ SQRT_2; hash[2] = rotate(hash[2] , 9u );
	/* Word 15 is zero, so only the constant is added. */
	hash[1] += MD4_G(hash[2], hash[3], hash[0]) + SQRT_2; hash[1] = rotate(hash[1] , 13u);

	/* Round 3 (MD4_H/MD4_H2 are presumably the XOR step split in two by the
	 * shared opencl_md4.h header — TODO confirm their exact definitions). */
	hash[0] += MD4_H(hash[1], hash[2], hash[3]) + nt_buffer[0] + SQRT_3; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_H2(hash[0], hash[1], hash[2]) + nt_buffer[8] + SQRT_3; hash[3] = rotate(hash[3] , 9u );
	hash[2] += MD4_H(hash[3], hash[0], hash[1]) + nt_buffer[4] + SQRT_3; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_H2(hash[2], hash[3], hash[0]) + nt_buffer[12] + SQRT_3; hash[1] = rotate(hash[1] , 15u);

	hash[0] += MD4_H(hash[1], hash[2], hash[3]) + nt_buffer[2] + SQRT_3; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_H2(hash[0], hash[1], hash[2]) + nt_buffer[10] + SQRT_3; hash[3] = rotate(hash[3] , 9u );
	hash[2] += MD4_H(hash[3], hash[0], hash[1]) + nt_buffer[6] + SQRT_3; hash[2] = rotate(hash[2] , 11u);
	hash[1] += MD4_H2(hash[2], hash[3], hash[0]) + md4_size + SQRT_3; hash[1] = rotate(hash[1] , 15u);

	hash[0] += MD4_H(hash[1], hash[2], hash[3]) + nt_buffer[1] + SQRT_3; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_H2(hash[0], hash[1], hash[2]) + nt_buffer[9] + SQRT_3; hash[3] = rotate(hash[3] , 9u );
	hash[2] += MD4_H(hash[3], hash[0], hash[1]) + nt_buffer[5] + SQRT_3; hash[2] = rotate(hash[2] , 11u);
	/* It is cheaper to compute these remaining steps than to access global
	 * memory for a full hash comparison: hash[1]'s last step is split so the
	 * un-rotated value stays in hash[1] for cmp(), while the fully processed
	 * value (tmp) feeds the final three steps. */
	hash[1] += MD4_H2(hash[2], hash[3], hash[0]) + nt_buffer[13];
	tmp = hash[1];
	tmp += SQRT_3; tmp = rotate(tmp , 15u);

	hash[0] += MD4_H(hash[3], hash[2], tmp) + nt_buffer[3] + SQRT_3; hash[0] = rotate(hash[0] , 3u );
	hash[3] += MD4_H2(hash[2], tmp, hash[0]) + nt_buffer[11] + SQRT_3; hash[3] = rotate(hash[3] , 9u );
	hash[2] += MD4_H(tmp, hash[0], hash[3]) + nt_buffer[7] + SQRT_3; hash[2] = rotate(hash[2] , 11u);
}

#if __OS_X__ && (cpu(DEVICE_INFO) || gpu_nvidia(DEVICE_INFO))
/* This is a workaround for driver/runtime bugs */
#define MAYBE_VOLATILE volatile
#else
#define MAYBE_VOLATILE
#endif

#if UTF_8

/*
 * Convert a UTF-8 key (no NUL terminator; `length` bytes at `key`) into
 * UTF-16 code units stored in nt_buffer, append the 0x80 MD4 padding byte,
 * and return the length in UTF-16 code units.
 *
 * Truncates at PLAINTEXT_LENGTH code units. Invalid lead bytes map to
 * UNI_REPLACEMENT_CHAR and stop the conversion. With UCS_2 defined,
 * supplementary characters are truncated to 16 bits instead of being encoded
 * as surrogate pairs.
 *
 * NOTE(review): opt_trailingBytesUTF8 / offsetsFromUTF8 come from
 * opencl_unicode.h; the `*target == UNI_REPLACEMENT_CHAR` test after the
 * switch reads the not-yet-written slot in the normal path, so it relies on
 * the caller zero-initializing nt_buffer — confirm callers do so (the kernel
 * below does).
 */
inline uint prepare_key(__global uint *key, uint length,
                        MAYBE_VOLATILE uint *nt_buffer)
{
	const __global UTF8 *source = (const __global UTF8*)key;
	const __global UTF8 *sourceEnd = &source[length];
	MAYBE_VOLATILE UTF16 *target = (UTF16*)nt_buffer;
	MAYBE_VOLATILE const UTF16 *targetEnd = &target[PLAINTEXT_LENGTH];
	UTF32 ch;
	uint extraBytesToRead;

	/* Input buffer is UTF-8 without zero-termination */
	while (source < sourceEnd) {
		if (*source < 0xC0) {
			/* ASCII byte or (invalid) stray continuation byte: copy as-is. */
			*target++ = (UTF16)*source++;
			if (target >= targetEnd)
				break;
			continue;
		}
		ch = *source;
		// This point must not be reached with *source < 0xC0
		extraBytesToRead =
			opt_trailingBytesUTF8[ch & 0x3f];
		if (source + extraBytesToRead >= sourceEnd) {
			/* Truncated multi-byte sequence at end of input: stop here. */
			break;
		}
		switch (extraBytesToRead) {
		/* Intentional fallthrough: accumulate 6 bits per trailing byte. */
		case 3:
			ch <<= 6;
			ch += *++source;
		case 2:
			ch <<= 6;
			ch += *++source;
		case 1:
			ch <<= 6;
			ch += *++source;
			++source;
			break;
		default:
			/* Sequences longer than 4 bytes are invalid UTF-8. */
			*target = UNI_REPLACEMENT_CHAR;
			break; // from switch
		}
		if (*target == UNI_REPLACEMENT_CHAR)
			break; // from while
		/* Remove the accumulated lead-byte tag bits in one subtraction. */
		ch -= offsetsFromUTF8[extraBytesToRead];
#ifdef UCS_2
		/* UCS-2 only */
		*target++ = (UTF16)ch;
#else
		/* full UTF-16 with surrogate pairs */
		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
			*target++ = (UTF16)ch;
		} else { /* target is a character in range 0xFFFF - 0x10FFFF. */
			if (target + 1 >= targetEnd)
				break;
			ch -= halfBase;
			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
		}
#endif
		if (target >= targetEnd)
			break;
	}
	*target = 0x80; // Terminate (MD4 padding byte as a UTF-16 unit)

	return (uint)(target - (UTF16*)nt_buffer);
}

#else

/*
 * Codepage variant: expand each 8-bit key byte to a UTF-16 code unit via the
 * CP_LUT translation table, packing two units per uint, then place the 0x80
 * MD4 padding byte after the last character. Returns the length in characters
 * (equal to the input length, since this is a 1:1 conversion).
 *
 * NOTE(review): for even lengths the terminator is OR-ed over the low half of
 * the word at length/2, which the loop may already have written from trailing
 * key bytes — this presumes keys are zero-padded to a uint boundary on the
 * host side; confirm against the key-transfer code.
 */
inline uint prepare_key(__global uint *key, uint length, uint *nt_buffer)
{
	uint i, nt_index, keychars;

	nt_index = 0;
	/* Consume 4 key bytes per iteration, emitting 2 packed uints. */
	for (i = 0; i < (length + 3)/ 4; i++) {
		keychars = key[i];
		nt_buffer[nt_index++] = CP_LUT(keychars & 0x000000FF) | (CP_LUT((keychars & 0x0000FF00) >> 8) << 16);
		nt_buffer[nt_index++] = CP_LUT((keychars & 0x00FF0000) >> 16) | (CP_LUT(keychars >> 24) << 16);
	}
	/* 0x80 goes in the low half for even lengths, high half for odd. */
	nt_index = length >> 1;
	nt_buffer[nt_index] = (nt_buffer[nt_index] & 0xFFFF) | (0x80 << ((length & 1) << 4));

	return length;
}

#endif /* UTF_8 */

/*
 * Full comparison after the bitmap prefilter matched: look up the computed
 * hash in a perfect hash table (offset_table + hash_table, built on the
 * host), and on a hit record the cracked candidate.
 *
 * On success: atomically reserves a slot in `output` (output[0] is the crack
 * counter) and stores {gid, iter, hash_table_index}; hash[2]/hash[3] go to
 * return_hashes for the host to verify the full 128 bits. bitmap_dupe
 * suppresses multiple keys claiming the same hash table entry.
 */
inline void cmp_final(uint gid,
		uint iter,
		__private uint *hash,
		__global uint *offset_table,
		__global uint *hash_table,
		__global uint *return_hashes,
		volatile __global uint *output,
		volatile __global uint *bitmap_dupe) {

	uint t, offset_table_index, hash_table_index;
	unsigned long LO, HI;
	unsigned long p;

	/* Treat the 128-bit hash as two 64-bit halves for the modular probes. */
	HI = ((unsigned long)hash[3] << 32) | (unsigned long)hash[2];
	LO = ((unsigned long)hash[1] << 32) | (unsigned long)hash[0];

	/* 128-bit value mod OFFSET_TABLE_SIZE, using the precomputed
	 * SHIFT64_OT_SZ = 2^64 mod OFFSET_TABLE_SIZE factor. */
	p = (HI % OFFSET_TABLE_SIZE) * SHIFT64_OT_SZ;
	p += LO % OFFSET_TABLE_SIZE;
	p %= OFFSET_TABLE_SIZE;
	offset_table_index = (unsigned int)p;

	/* Known issue upstream: this addition could in principle overflow LO,
	 * but the chance is considered negligible. */
	LO += (unsigned long)offset_table[offset_table_index];

	/* Same reduction, now mod HASH_TABLE_SIZE, to get the final slot. */
	p = (HI % HASH_TABLE_SIZE) * SHIFT64_HT_SZ;
	p += LO % HASH_TABLE_SIZE;
	p %= HASH_TABLE_SIZE;
	hash_table_index = (unsigned int)p;

	/* hash_table stores hash[0] in the first half and hash[1] in the
	 * second; a match on both 32-bit words counts as a candidate crack. */
	if (hash_table[hash_table_index] == hash[0])
		if (hash_table[HASH_TABLE_SIZE + hash_table_index] == hash[1])
		{
/*
 * Prevent duplicate keys from cracking same hash
 */
			if (!(atomic_or(&bitmap_dupe[hash_table_index/32], (1U << (hash_table_index % 32))) & (1U << (hash_table_index % 32)))) {
				t = atomic_inc(&output[0]);
				output[1 + 3 * t] = gid;
				output[2 + 3 * t] = iter;
				output[3 + 3 * t] = hash_table_index;
				return_hashes[2 * t] = hash[2];
				return_hashes[2 * t + 1] = hash[3];
			}
		}
}

/*
 * Bitmap prefilter: probe SELECT_CMP_STEPS sub-bitmaps (each BITMAP_SIZE_BITS
 * wide, laid out consecutively in `bitmaps`) with different 16-bit slices of
 * the hash words; only if every probe hits do we fall through to the exact
 * lookup in cmp_final(). `bitmaps` may live in local or global memory
 * depending on USE_LOCAL_BITMAPS.
 */
inline void cmp(uint gid,
		uint iter,
		__private uint *hash,
#if USE_LOCAL_BITMAPS
		__local
#else
		__global
#endif
		uint *bitmaps,
		__global uint *offset_table,
		__global uint *hash_table,
		__global uint *return_hashes,
		volatile __global uint *output,
		volatile __global uint *bitmap_dupe) {
	uint bitmap_index, tmp = 1;

	/* The final MD4 IV addition is intentionally skipped (matches
	 * nt_crypt(), which also omits it):
	hash[0] += 0x67452301;
	hash[1] += 0xefcdab89;
	hash[2] += 0x98badcfe;
	hash[3] += 0x10325476;*/

	/* Note: (BITMAP_SIZE_BITS >> 4) == 2 * (BITMAP_SIZE_BITS >> 5) and
	 * (BITMAP_SIZE_BITS >> 3) == 4 * (BITMAP_SIZE_BITS >> 5), i.e. each
	 * probe below indexes the k-th consecutive sub-bitmap. */
#if SELECT_CMP_STEPS > 4
	bitmap_index = hash[0] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
	bitmap_index = (hash[0] >> 16) & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[1] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 4) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = (hash[1] >> 16) & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 3 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 3) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = (hash[2] >> 16) & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 5 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 6 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = (hash[3] >> 16) & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 7 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
#elif SELECT_CMP_STEPS > 2
	bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[1] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 4) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[0] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) * 3 + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
#elif SELECT_CMP_STEPS > 1
	bitmap_index = hash[3] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
	bitmap_index = hash[2] & (BITMAP_SIZE_BITS - 1);
	tmp &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
#else
	bitmap_index = hash[3] & BITMAP_SIZE_BITS_LESS_ONE;
	tmp &= (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
#endif

	if (tmp)
		cmp_final(gid, iter, hash, offset_table, hash_table, return_hashes, output, bitmap_dupe);
}

/* Use constant memory for the generated mask keys only if they all fit. */
#define USE_CONST_CACHE \
	(CONST_CACHE_SIZE >= (NUM_INT_KEYS * 4))

/* OpenCL kernel entry point. Copy key to be hashed from
 * global to local (thread) memory. Break the key into 16 32-bit (uint)
 * words. MD4 hash of a key is 128 bit (uint4).
 *
 * Layout of `index[gid]`: bits 7.. give the uint offset of this work-item's
 * key in `keys`; bits 0..6 give its length. For mask mode, each work-item
 * additionally iterates NUM_INT_KEYS host-generated key fragments
 * (int_keys), patched into nt_buffer at the GPU_LOC_* character positions.
 */
__kernel void nt(__global uint *keys,
		__global uint *index,
		__global uint *int_key_loc,
#if USE_CONST_CACHE
		constant
#else
		__global
#endif
		uint *int_keys
#if !defined(__OS_X__) && USE_CONST_CACHE && gpu_amd(DEVICE_INFO)
		__attribute__((max_constant_size (NUM_INT_KEYS * 4)))
#endif
		, __global uint *bitmaps,
		__global uint *offset_table,
		__global uint *hash_table,
		__global uint *return_hashes,
		volatile __global uint *out_hash_ids,
		volatile __global uint *bitmap_dupe)
{
	uint i;
	uint gid = get_global_id(0);
	uint base = index[gid];
	/* 14 uints = 28 UTF-16 units: up to 27 characters + 0x80 terminator.
	 * Zero-init matters: prepare_key() relies on untouched words being 0. */
	uint nt_buffer[14] = { 0 };
	uint md4_size = base & 127;
	uint hash[4];

#if NUM_INT_KEYS > 1 && !IS_STATIC_GPU_MASK
	/* Per-candidate mask positions, one byte per placeholder. */
	uint ikl = int_key_loc[gid];
	uint loc0 = ikl & 0xff;
#if MASK_FMT_INT_PLHDR > 1
#if LOC_1 >= 0
	uint loc1 = (ikl & 0xff00) >> 8;
#endif
#endif
#if MASK_FMT_INT_PLHDR > 2
#if LOC_2 >= 0
	uint loc2 = (ikl & 0xff0000) >> 16;
#endif
#endif
#if MASK_FMT_INT_PLHDR > 3
#if LOC_3 >= 0
	uint loc3 = (ikl & 0xff000000) >> 24;
#endif
#endif
#endif

#if !IS_STATIC_GPU_MASK
#define GPU_LOC_0 loc0
#define GPU_LOC_1 loc1
#define GPU_LOC_2 loc2
#define GPU_LOC_3 loc3
#else
#define GPU_LOC_0 LOC_0
#define GPU_LOC_1 LOC_1
#define GPU_LOC_2 LOC_2
#define GPU_LOC_3 LOC_3
#endif

#if USE_LOCAL_BITMAPS
	/* Stage the bitmaps into local memory, strided across the work-group.
	 * NOTE(review): assumes total bitmap words is a multiple of the local
	 * size — presumably guaranteed by the host; confirm there. */
	uint lid = get_local_id(0);
	uint lws = get_local_size(0);
	uint __local s_bitmaps[(BITMAP_SIZE_BITS >> 5) * SELECT_CMP_STEPS];

	for (i = 0; i < (((BITMAP_SIZE_BITS >> 5) * SELECT_CMP_STEPS) / lws); i++)
		s_bitmaps[i*lws + lid] = bitmaps[i*lws + lid];

	barrier(CLK_LOCAL_MEM_FENCE);
#endif

	keys += base >> 7;
	md4_size = prepare_key(keys, md4_size, nt_buffer);
	md4_size = md4_size << 4; /* characters -> bits (16 bits per char) */

	for (i = 0; i < NUM_INT_KEYS; i++) {
#if NUM_INT_KEYS > 1
		/* Patch the mask placeholder characters for this candidate. */
		PUTSHORT(nt_buffer, GPU_LOC_0, CP_LUT(int_keys[i] & 0xff));
#if MASK_FMT_INT_PLHDR > 1
#if LOC_1 >= 0
		PUTSHORT(nt_buffer, GPU_LOC_1, CP_LUT((int_keys[i] & 0xff00) >> 8));
#endif
#endif
#if MASK_FMT_INT_PLHDR > 2
#if LOC_2 >= 0
		PUTSHORT(nt_buffer, GPU_LOC_2, CP_LUT((int_keys[i] & 0xff0000) >> 16));
#endif
#endif
#if MASK_FMT_INT_PLHDR > 3
#if LOC_3 >= 0
		PUTSHORT(nt_buffer, GPU_LOC_3, CP_LUT((int_keys[i] & 0xff000000) >> 24));
#endif
#endif
#endif
		nt_crypt(hash, nt_buffer, md4_size);
		cmp(gid, i, hash,
#if USE_LOCAL_BITMAPS
		    s_bitmaps
#else
		    bitmaps
#endif
		    , offset_table, hash_table, return_hashes, out_hash_ids, bitmap_dupe);
	}
}