1 // freerainbowtables is a project for generating, distributing, and using
2 // perfect rainbow tables
3 //
4 // Copyright 2010, 2011 Jan Kyska
5 // Copyright 2010 Martin Westergaard Jørgensen <martinwj2005@gmail.com>
6 // Copyright 2010, 2011, 2012 James Nobis <quel@quelrod.net>
7 //
8 // This file is part of freerainbowtables.
9 //
10 // freerainbowtables is free software: you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 2 of the License, or
13 // (at your option) any later version.
14 //
15 // freerainbowtables is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with freerainbowtables. If not, see <http://www.gnu.org/licenses/>.
22
23
24 #include <stdio.h>
25 #include <cuda.h>
26 #include "rcuda.h"
27
// Launch geometry: a 64x64 grid of 64-thread blocks.  The dimensions are
// kept as log2 values so flat indices can be composed with shifts.
#define GRID_X_L2 6
#define GRID_Y_L2 6
#define BLOCK_X_L2 6
#define GRID_X_SIZE (1<<GRID_X_L2)
#define GRID_Y_SIZE (1<<GRID_Y_L2)
#define BLOCK_X_SIZE (1<<BLOCK_X_L2)
// Maximum plaintext length handled by the index-to-plain conversion macros.
#define PLAIN_MAX_SIZE 20
//#define KERN_CHAIN_SIZE 100
// Maximum character-set table size copied into constant memory (cplChrSet).
#define CHAR_SET_MAXLEN 512
// Shared-memory layout helpers.  Each thread's record fields live 16 words
// apart (SHIDX), and DATAIDX interleaves the records of 16 consecutive
// threads -- apparently to spread a warp's accesses across shared-memory
// banks (NOTE(review): layout intent inferred, confirm against kernels).
#define SHIDX(x) ((x)<<4)
#define DATAIDX(vol, idx) ((((idx)>>4)<<(4+vol))+((idx)&15))
// Flat global thread index for the 2-D grid of 1-D blocks.
#define THREADIDX ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x)
40
41
// Device-side state shared by all kernels pulled in from the rcuda_*.inc
// files.  The host sets every symbol below with cudaMemcpyToSymbol before
// launching.
__device__ uint64_t *dataHeap;                      // per-chain records in device global memory
__constant__ __device__ int *plCpPos;               // checkpoint chain positions (device array)
__constant__ __device__ int plCpPosSize;            // number of entries in plCpPos
__constant__ __device__ uint64_t reduceOffset;      // table-specific offset added by the reduction
__constant__ __device__ uint64_t plainSpaceTotal;   // total plaintext keyspace size
__constant__ __device__ uint64_t rPlainSpaceTotal;  // (2^64-1)/plainSpaceTotal: reciprocal for modulo
__constant__ __device__ unsigned char cplChrSet[CHAR_SET_MAXLEN];  // character set table
__constant__ __device__ uint2 cplDimVec[2*PLAIN_MAX_SIZE];         // packed per-position divisor entries
__constant__ __device__ uint64_t cplStartIdx;             // first chain step for precalculation
__constant__ __device__ unsigned int cplTargetHash[8];    // hash searched for by CheckAlarm kernels
__constant__ __device__ int cplTargetHashSize;            // its length in 32-bit words
__device__ int termKernel;  // set by CheckAlarm kernels when the target hash is found
54
// RTGEN_PROLOGUE opens the body of every chain-generation kernel (the
// RTGen*Kernel functions in the included rcuda_*.inc files).  It declares
// the working variables, stages this block's chain records from the global
// dataHeap into shared memory (thread 0 copies, then a barrier), unpacks
// the thread's 64-bit chain index and its packed checkpoint word, and opens
// the loop over chain positions [chainStart, chainStop) -- presumably the
// kernel's parameters, declared in the .inc files (not visible here).
// Each dataHeap record is two uint64_t: the chain index, and a word whose
// low 16 bits are recorded checkpoint bits and whose upper bits count the
// checkpoints consumed so far (repacked in RTGEN_EPILOGUE).
#define RTGEN_PROLOGUE \
	unsigned int *hData; \
	unsigned int idx; \
	uint64_t idx64; \
	unsigned int nPos, ii; \
	unsigned int cpcheck, checkpoint; \
	uint3 dimItem; \
	uint64_t uiDiv64, uiVal64, uiMul64; \
	uint64_t plainSpace, idx64mod; \
	unsigned int uiVal, uiDiv; \
	unsigned int dimVecIdx; \
	\
	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; \
	\
	/* thread 0 stages the whole block's records into shared memory */ \
	if(threadIdx.x == 0) { \
		nPos = (THREADIDX<<1); \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
			hData = shData + DATAIDX(4, ii); \
			hData[SHIDX(0)] = dataHeap[nPos]; \
			hData[SHIDX(1)] = dataHeap[nPos]>>32; \
			hData[SHIDX(2)] = dataHeap[nPos+1]; \
		} \
	} \
	__syncthreads(); \
	\
	hData = shData + DATAIDX(4, threadIdx.x); \
	\
	idx64 = hData[SHIDX(1)]; \
	idx64 = (idx64<<32) | hData[SHIDX(0)]; \
	cpcheck = hData[SHIDX(2)]; \
	checkpoint = cpcheck&0x0000ffff; /* low 16 bits: recorded checkpoint bits */ \
	cpcheck = cpcheck>>16; /* high bits: number of checkpoints consumed */ \
	\
	for(nPos = chainStart; nPos < chainStop; nPos++) {
89
// RTGEN_EPILOGUE closes the loop opened by RTGEN_PROLOGUE.  Per iteration
// it applies the reduction function to the hash words the kernel body left
// in hData: reassemble the 64-bit value, add reduceOffset + nPos, then take
// it modulo plainSpaceTotal with a multiply-high by the precomputed
// reciprocal rPlainSpaceTotal plus one conditional correction.  When nPos
// hits the next checkpoint position, the low bit of the reduced index is
// recorded into the checkpoint word.  After the loop the final index and
// the repacked checkpoint word are stored to shared memory and, after a
// barrier, written back to the global dataHeap by thread 0.
#define RTGEN_EPILOGUE \
		idx64 = hData[SHIDX(1)]; \
		idx64 = (idx64<<32) | hData[SHIDX(0)]; \
		idx64 += reduceOffset + nPos; \
		uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); /* approx idx64/plainSpaceTotal */ \
		idx64 -= uiDiv64*plainSpaceTotal; \
		if(idx64 >= plainSpaceTotal) \
			idx64 -= plainSpaceTotal; \
		\
		if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) { \
			checkpoint |= ((unsigned int)idx64&1) << cpcheck; \
			cpcheck++; \
		} \
	} \
	\
	hData[SHIDX(0)] = idx64; \
	hData[SHIDX(1)] = idx64>>32; \
	hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff); \
	__syncthreads(); \
	\
	if(threadIdx.x == 0) { \
		nPos = (THREADIDX<<1); \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
			hData = shData + DATAIDX(4, ii); \
			dataHeap[nPos] = ((uint64_t)hData[SHIDX(1)]<<32)|(uint64_t)hData[SHIDX(0)]; \
			dataHeap[nPos+1] = hData[SHIDX(2)]; \
		} \
	}
118
// RTGEN_IDX2PLAIN_BEGIN / _END bracket the index-to-plaintext conversion.
// cplDimVec is consumed sequentially in groups: the first entry of a group
// packs a 64-bit sub-keyspace size (x | y<<32) into plainSpace.  BEGIN
// splits idx64 into the quotient (decoded character-by-character by the
// LOOP64/LOOP32 macros between BEGIN and END) and the remainder idx64mod,
// which is carried into the next group.  END skips any entries of the
// current group that the loops did not consume -- bit 512 of the packed
// flag field (x>>16) appears to mark a group's last entry (NOTE(review):
// inferred from this scan, confirm against the table builder) -- and the
// do/while continues until a group with plainSpace <= 1 is reached.
#define RTGEN_IDX2PLAIN_BEGIN \
	dimVecIdx = 0; \
	ii = 0; \
	do { \
		plainSpace = (uint64_t)cplDimVec[dimVecIdx].x | ((uint64_t)cplDimVec[dimVecIdx].y<<32); \
		dimVecIdx++; \
		uiVal64 = idx64/plainSpace; \
		idx64mod = idx64 - uiVal64*plainSpace; \
		idx64 = uiVal64

#define RTGEN_IDX2PLAIN_END \
		if(!(dimItem.z&512)) { \
			/* skip the rest of this group's entries */ \
			while(!((cplDimVec[dimVecIdx].x>>16)&512)) \
				dimVecIdx++; \
			dimVecIdx++; \
		} \
		\
		idx64 = idx64mod; \
	} while(plainSpace > 1)
138
// RTGEN_I2P_LOOP64_BEGIN / _END: per-character extraction loop used while
// the remaining index is still effectively 64-bit (idx64 > 0xfffffff0).
// Each cplDimVec entry packs the character-set size in x&0xffff
// (dimItem.x), a 32-bit fixed-point reciprocal in y (dimItem.y), and flag
// bits in x>>16 (dimItem.z).  idx64/dimItem.x is computed with
// __umul64hi/__umulhi reciprocal multiplies plus one correction step; the
// remainder lands in uiDiv (presumably the character index the kernel body
// maps through cplChrSet -- the .inc bodies are not visible here).
// Flag 256 appears to implement an optional position: a zero remainder at a
// nonzero quotient borrows one full set, and nonzero remainders are shifted
// down by one (NOTE(review): semantics inferred, confirm).  Flag 512 ends
// the group (see RTGEN_IDX2PLAIN_END).
#define RTGEN_I2P_LOOP64_BEGIN \
	dimItem.z = 0; \
	for(; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE && !(dimItem.z&512); ii++, dimVecIdx++) { \
		uiVal64 = idx64; \
		dimItem.x = (cplDimVec[dimVecIdx].x&0xffffu); \
		dimItem.y = cplDimVec[dimVecIdx].y; \
		dimItem.z = (cplDimVec[dimVecIdx].x>>16); \
		\
		uiMul64 = (uint64_t)dimItem.y<<32; \
		idx64 = __umul64hi(uiVal64, uiMul64); /* high part of the quotient */ \
		uiDiv64 = uiVal64 - idx64*(uint64_t)dimItem.x; \
		uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y); \
		uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x; \
		idx64 += uiVal; \
		if(uiDiv >= dimItem.x) { /* correct the off-by-one of the reciprocal */ \
			uiDiv -= dimItem.x; \
			idx64++; \
		} \
		\
		if(dimItem.z&256) { \
			if(!uiDiv && idx64>0) { \
				uiDiv = dimItem.x; \
				idx64--; \
			} \
			uiDiv -= (uiDiv>0?1u:0); \
		}

#define RTGEN_I2P_LOOP64_END \
	}
168
// RTGEN_I2P_LOOP32_BEGIN / _END: 32-bit continuation of the LOOP64 macro,
// entered once the remaining index fits in an unsigned int.  Same cplDimVec
// entry layout and reciprocal-multiply division as LOOP64.  Under flag 256
// the loop additionally terminates early (advancing past the current entry)
// when both remainder and quotient are zero -- apparently the end of a
// variable-length plaintext (NOTE(review): inferred, confirm).
#define RTGEN_I2P_LOOP32_BEGIN \
	dimItem.z = 0; \
	for(idx = (unsigned int)idx64; ii < PLAIN_MAX_SIZE && !(dimItem.z&512); ii++, dimVecIdx++) { \
		uiVal = idx; \
		dimItem.x = (cplDimVec[dimVecIdx].x&0xffffu); \
		dimItem.y = cplDimVec[dimVecIdx].y; \
		dimItem.z = (cplDimVec[dimVecIdx].x>>16); \
		\
		idx = __umulhi(uiVal, dimItem.y); /* approx uiVal/dimItem.x */ \
		uiDiv = uiVal - idx*dimItem.x; \
		if(uiDiv >= dimItem.x) { /* correct the off-by-one of the reciprocal */ \
			uiDiv -= dimItem.x; \
			idx++; \
		} \
		\
		if(dimItem.z&256) { \
			if(!uiDiv && !idx) { \
				dimVecIdx++; \
				break; \
			} \
			if(!uiDiv && idx>0) { \
				uiDiv = dimItem.x; \
				idx--; \
			} \
			uiDiv -= (uiDiv>0?1u:0); \
		}

#define RTGEN_I2P_LOOP32_END \
	}
198
199
200 /*********** PreCalculate **************/
201
// RTPRECALC_PROLOGUE / _EPILOGUE: frame for the RTPreCalc* kernels.  Same
// shared-memory staging scheme as RTGEN_PROLOGUE, but dataHeap holds a
// single uint64_t per chain (the current index) and there is no checkpoint
// state.  Each chain gets its own first step, startNPos = cplStartIdx +
// global chain number + 1, so the loop covers
// [max(startNPos, chainStart), chainStop).  The epilogue applies the same
// reciprocal-multiply reduction as RTGEN_EPILOGUE and writes the final
// index back to dataHeap.
#define RTPRECALC_PROLOGUE \
	unsigned int *hData; \
	unsigned int idx; \
	uint64_t idx64; \
	unsigned int nPos, ii; \
	unsigned int startNPos; \
	uint3 dimItem; \
	uint64_t uiDiv64, uiVal64, uiMul64; \
	uint64_t plainSpace, idx64mod; \
	unsigned int uiVal, uiDiv; \
	unsigned int dimVecIdx; \
	\
	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; \
	\
	/* thread 0 stages the block's records; SHIDX(2) carries the chain's own start step */ \
	if(threadIdx.x == 0) { \
		nPos = THREADIDX; \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos++) { \
			hData = shData + DATAIDX(4, ii); \
			hData[SHIDX(0)] = dataHeap[nPos]; \
			hData[SHIDX(1)] = dataHeap[nPos]>>32; \
			hData[SHIDX(2)] = (unsigned int)cplStartIdx + nPos + 1; \
		} \
	} \
	__syncthreads(); \
	\
	hData = shData + DATAIDX(4, threadIdx.x); \
	\
	idx64 = hData[SHIDX(1)]; \
	idx64 = (idx64<<32) | hData[SHIDX(0)]; \
	startNPos = hData[SHIDX(2)]; \
	\
	for(nPos = (startNPos>chainStart? startNPos : chainStart); nPos < chainStop; nPos++) {

#define RTPRECALC_EPILOGUE \
		idx64 = hData[SHIDX(1)]; \
		idx64 = (idx64<<32) | hData[SHIDX(0)]; \
		idx64 += reduceOffset + nPos; \
		uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); /* approx idx64/plainSpaceTotal */ \
		idx64 -= uiDiv64*plainSpaceTotal; \
		if(idx64 >= plainSpaceTotal) \
			idx64 -= plainSpaceTotal; \
		\
	} \
	\
	hData[SHIDX(0)] = idx64; \
	hData[SHIDX(1)] = idx64>>32; \
	__syncthreads(); \
	\
	if(threadIdx.x == 0) { \
		nPos = THREADIDX; \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos++) { \
			hData = shData + DATAIDX(4, ii); \
			dataHeap[nPos] = ((uint64_t)hData[SHIDX(1)]<<32)|(uint64_t)hData[SHIDX(0)]; \
		} \
	}
257
258
259 /*********** CheckAlarm **************/
// Slot indices within shDataX for the per-chain parameter pair.
#define PARAMIDX_ALARM 0
#define PARAMIDX_GUESSEDPOS 1

// RTCHKALARM_PROLOGUE / _EPILOGUE: frame for the RTCheckAlarm* kernels that
// verify candidate ("alarm") chains.  dataHeap holds two uint64_t per
// chain: the current index, and a parameter word whose low 32 bits are the
// guessed chain position (stopNPos) and whose high 32 bits are the alarm
// flag.  A chain is skipped when this launch window starts past stopNPos or
// when both parameter halves are 0xffffffff (the "already matched" marker
// the epilogue writes).  The loop walks positions
// [chainStart, min(chainStop, stopNPos)], remembering the previous index in
// lastIdx64.  The epilogue either stores the advanced index with the alarm
// cleared (chain not finished in this window), or -- once stopNPos has been
// passed -- compares the first cplTargetHashSize 32-bit words the kernel
// body left in hData against cplTargetHash, keeping lastIdx64 and the match
// result.  Thread 0 writes everything back; a match marks the parameter
// word all-ones and raises termKernel so the host stops launching.
#define RTCHKALARM_PROLOGUE \
	unsigned int *hData, *hDataX; \
	unsigned int idx; \
	uint64_t idx64, lastIdx64; \
	unsigned int nPos, ii; \
	unsigned int stopNPos; \
	uint3 dimItem; \
	uint64_t uiDiv64, uiVal64, uiMul64; \
	uint64_t plainSpace, idx64mod; \
	unsigned int uiVal, uiDiv; \
	unsigned int dimVecIdx; \
	\
	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; \
	__shared__ unsigned int shDataX[BLOCK_X_SIZE<<1]; \
	\
	/* thread 0 stages indices and the parameter pairs into shared memory */ \
	if(threadIdx.x == 0) { \
		nPos = (THREADIDX<<1); \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
			hData = shData + DATAIDX(4, ii); \
			hDataX = shDataX + DATAIDX(1, ii); \
			hData[SHIDX(0)] = dataHeap[nPos]; \
			hData[SHIDX(1)] = dataHeap[nPos]>>32; \
			hDataX[SHIDX(PARAMIDX_ALARM)] = dataHeap[nPos+1]>>32; \
			hDataX[SHIDX(PARAMIDX_GUESSEDPOS)] = dataHeap[nPos+1]; \
		} \
	} \
	__syncthreads(); \
	\
	hData = shData + DATAIDX(4, threadIdx.x); \
	hDataX = shDataX + DATAIDX(1, threadIdx.x); \
	\
	idx64 = hData[SHIDX(1)]; \
	idx64 = (idx64<<32) | hData[SHIDX(0)]; \
	stopNPos = hDataX[SHIDX(PARAMIDX_GUESSEDPOS)]; \
	/* skip chains outside this window or already marked as matched */ \
	if(chainStart > stopNPos || (stopNPos == 0xfffffffful && hDataX[SHIDX(PARAMIDX_ALARM)] == 0xfffffffful)) \
		return; \
	chainStop = (chainStop>stopNPos? stopNPos : chainStop); \
	\
	lastIdx64 = 0; \
	for(nPos = chainStart; nPos <= chainStop; nPos++) { \
		lastIdx64 = idx64;

#define RTCHKALARM_EPILOGUE \
		idx64 = hData[SHIDX(1)]; \
		idx64 = (idx64<<32) | hData[SHIDX(0)]; \
		idx64 += reduceOffset + nPos; \
		uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); /* approx idx64/plainSpaceTotal */ \
		idx64 -= uiDiv64*plainSpaceTotal; \
		if(idx64 >= plainSpaceTotal) \
			idx64 -= plainSpaceTotal; \
		\
	} \
	\
	if(nPos <= stopNPos) { \
		/* window ended before the guessed position: keep advancing next launch */ \
		hData[SHIDX(0)] = idx64; \
		hData[SHIDX(1)] = idx64>>32; \
		hDataX[SHIDX(PARAMIDX_ALARM)] = 0; \
	} else { \
		/* reached the guessed position: compare the hash against the target */ \
		uiVal = 1; \
		for(nPos = 0; nPos < cplTargetHashSize; nPos++) \
			uiVal = (cplTargetHash[nPos]==hData[SHIDX(nPos)]? uiVal : 0); \
		hData[SHIDX(0)] = lastIdx64; \
		hData[SHIDX(1)] = lastIdx64>>32; \
		hDataX[SHIDX(PARAMIDX_ALARM)] = uiVal; \
	} \
	__syncthreads(); \
	\
	if(threadIdx.x == 0) { \
		nPos = (THREADIDX<<1); \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
			hData = shData + DATAIDX(4, ii); \
			hDataX = shDataX + DATAIDX(1, ii); \
			dataHeap[nPos] = ((uint64_t)hData[SHIDX(1)]<<32)|(uint64_t)hData[SHIDX(0)]; \
			if(hDataX[SHIDX(PARAMIDX_ALARM)]) { \
				dataHeap[nPos+1] = 0xffffffffffffffffull; \
				termKernel = 1; \
			} \
		} \
	}
342
343
344 #include "rcuda_md5.inc"
345 #include "rcuda_md4.inc"
346 #include "rcuda_ntlm.inc"
347 #include "rcuda_sha1.inc"
348 #include "rcuda_lm.inc"
349
// Advances every chain in resultBuff through the rainbow-table generation
// kernels for the hash algorithm selected by task->hash.
//
// resultBuff holds task->idxCount records of two uint64_t each (chain index
// plus packed checkpoint word; see RTGEN_PROLOGUE).  The buffer is copied
// to the device, processed in batches of task->kernChainSize chain links,
// and copied back on success.  Buffers must be sized via
// GetChainsBufferSize() since the grid is rounded up to whole blocks.
//
// Returns task->idxCount on success, 0 for an unsupported hash type, and -1
// on a CUDA error or an oversized character set.
extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64_t *resultBuff) {
	cudaSetDeviceFlags(cudaDeviceBlockingSync);

	cudaError_t cuErr;
	uint64_t *data;
	int *cpPos = NULL;  // device copy of the checkpoint position table
	uint64_t uiVal64;

	if(task->charSetSize > CHAR_SET_MAXLEN)
		return -1;

	// Reject unsupported hash types before touching the device.
	switch(task->hash) {
	case rcuda::RHASH_MD5:
	case rcuda::RHASH_MD4:
	case rcuda::RHASH_NTLM:
	case rcuda::RHASH_SHA1:
	case rcuda::RHASH_MYSQLSHA1:
	case rcuda::RHASH_LM:
		break;
	default:
		return 0;
	}

	// BLOCK_X_SIZE extra records of padding so the rounded-up grid's last
	// block can touch slots past idxCount without faulting.
	cudaMalloc((void**)&data, (task->idxCount+BLOCK_X_SIZE)*2*sizeof(uint64_t));
	cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));

	cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64_t), cudaMemcpyHostToDevice);
	cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);

	// Publish the task parameters to the device symbols read by the kernels.
	cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
	cudaMemcpyToSymbol(cplDimVec, task->dimVec, min((unsigned long long)task->dimVecSize*sizeof(uint2), (unsigned long long)sizeof(cplDimVec)));
	cudaMemcpyToSymbol(cplChrSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN));
	cudaMemcpyToSymbol(plCpPos, &cpPos, sizeof(cpPos));
	cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize));
	cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
	cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
	// Reciprocal of the keyspace size for the kernels' modulo reduction.
	uiVal64 = (uint64_t)-1/task->plainSpaceTotal;
	cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));

	// Fixed-width grid: X stays GRID_X_SIZE, Y grows to cover all chains.
	int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	grSizeX = GRID_X_SIZE;
	dim3 numBlocks(grSizeX, grSizeY);
	cuErr = cudaSuccess;

	// Walk the chain in kernChainSize batches, synchronizing after each
	// launch (presumably to stay under the driver watchdog -- unverified).
	for(unsigned int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=task->kernChainSize) {
		switch(task->hash) {
		case rcuda::RHASH_MD5:
			RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_MD4:
			RTGenMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_NTLM:
			RTGenNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_SHA1:
			RTGenSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_MYSQLSHA1:
			RTGenMySQLSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_LM:
			RTGenLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_UNDEF:
			break;
		}

		cuErr = cudaThreadSynchronize();
	}

	if(cuErr == cudaSuccess)
		cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64_t), cudaMemcpyDeviceToHost);
	else
		fprintf(stderr, "Error happened: %d (%s)\n", cuErr, cudaGetErrorString(cuErr));

	// BUGFIX: cpPos is always allocated above but was previously freed only
	// when task->cpPosSize > 0, leaking the allocation for an empty
	// checkpoint table.  cudaFree(NULL) is a documented no-op, so free
	// unconditionally.
	cudaFree(cpPos);
	cudaFree(data);
	return cuErr==cudaSuccess? task->idxCount : -1;
}
433
// Precalculates chain indices: each chain in resultBuff (one uint64_t per
// chain) is advanced from its own starting step (task->startIdx + chain
// number + 1, set up device-side in RTPRECALC_PROLOGUE) up to
// task->rainbowChainLen-1 using the RTPreCalc* kernel for the selected
// hash, in batches of task->kernChainSize links.
//
// Returns task->idxCount on success, 0 for an unsupported hash type, and -1
// on a CUDA error or an oversized character set.
extern "C" int PreCalculateOnCUDA(const rcuda::RCudaTask* task, uint64_t *resultBuff) {
	cudaSetDeviceFlags(cudaDeviceBlockingSync);

	cudaError_t cuErr;
	uint64_t *data;
	uint64_t uiVal64;

	if(task->charSetSize > CHAR_SET_MAXLEN)
		return -1;

	// Reject unsupported hash types before touching the device.
	switch(task->hash) {
	case rcuda::RHASH_MD5:
	case rcuda::RHASH_MD4:
	case rcuda::RHASH_NTLM:
	case rcuda::RHASH_SHA1:
	case rcuda::RHASH_MYSQLSHA1:
	case rcuda::RHASH_LM:
		break;
	default:
		return 0;
	}

	// One uint64_t per chain here, plus BLOCK_X_SIZE records of padding for
	// the rounded-up grid.
	cudaMalloc((void**)&data, (task->idxCount+BLOCK_X_SIZE)*sizeof(uint64_t));
	cudaMemcpy(data, resultBuff, task->idxCount*sizeof(uint64_t), cudaMemcpyHostToDevice);

	// Publish the task parameters to the device symbols read by the kernels.
	cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
	cudaMemcpyToSymbol(cplDimVec, task->dimVec, min((unsigned long long)task->dimVecSize*sizeof(uint2),(unsigned long long)sizeof(cplDimVec)));
	cudaMemcpyToSymbol(cplChrSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN));
	cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
	cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
	// Reciprocal of the keyspace size for the kernels' modulo reduction.
	uiVal64 = (uint64_t)-1/task->plainSpaceTotal;
	cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));
	cudaMemcpyToSymbol(cplStartIdx, &task->startIdx, sizeof(task->startIdx));

	// Fixed-width grid: X stays GRID_X_SIZE, Y grows to cover all chains.
	int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	grSizeX = GRID_X_SIZE;
	dim3 numBlocks(grSizeX, grSizeY);
	cuErr = cudaSuccess;

	// Batched launches starting at the task's start position.
	for(unsigned int idx = (unsigned int)task->startIdx; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=task->kernChainSize) {
		switch(task->hash) {
		case rcuda::RHASH_MD5:
			RTPreCalcMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_MD4:
			RTPreCalcMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_NTLM:
			RTPreCalcNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_SHA1:
			RTPreCalcSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_MYSQLSHA1:
			RTPreCalcMySQLSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_LM:
			RTPreCalcKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_UNDEF:
			break;
		}

		cuErr = cudaThreadSynchronize();
	}

	if(cuErr == cudaSuccess)
		cudaMemcpy(resultBuff, data, task->idxCount*sizeof(uint64_t), cudaMemcpyDeviceToHost);
	else
		fprintf(stderr, "Error happened: %d (%s)\n", cuErr, cudaGetErrorString(cuErr));

	cudaFree(data);
	return cuErr==cudaSuccess? task->idxCount : -1;
}
509
// Verifies candidate ("alarm") chains against the target hash.  Each record
// in resultBuff is two uint64_t: the chain's current index, and a parameter
// word holding the guessed position (low 32 bits) and the alarm flag (high
// 32 bits); see the RTCHKALARM_* macros.  Chains are advanced to their
// guessed position and the resulting hash is compared on the device; a
// match marks the record all-ones and raises termKernel, which stops
// further launches early.
//
// Returns task->idxCount on success, 0 for an unsupported hash type, and -1
// on a CUDA error or an oversized character set.
extern "C" int CheckAlarmOnCUDA(const rcuda::RCudaTask* task, uint64_t *resultBuff) {
	cudaSetDeviceFlags(cudaDeviceBlockingSync);

	cudaError_t cuErr;
	uint64_t *data;
	uint64_t uiVal64;
	int isize;

	if(task->charSetSize > CHAR_SET_MAXLEN)
		return -1;

	// Pick the digest size for the selected hash and upload the target.
	// The case groups intentionally fall through: isize is only set by the
	// first group entered, later groups keep it.
	isize = 0;
	switch(task->hash) {
	case rcuda::RHASH_MD5:
	case rcuda::RHASH_MD4:
	case rcuda::RHASH_NTLM:
		isize = (isize==0? 16 : isize);  // 128-bit digests
		/* fall through */
	case rcuda::RHASH_SHA1:
	case rcuda::RHASH_MYSQLSHA1:
		isize = (isize==0? 20 : isize);  // 160-bit digests
		/* fall through */
	case rcuda::RHASH_LM:
		isize = (isize==0? 8 : isize);   // 64-bit LM result
		cudaMemcpyToSymbol(cplTargetHash, task->targetHash, isize);
		isize = (isize>>2);  // the device compares 32-bit words
		cudaMemcpyToSymbol(cplTargetHashSize, &isize, sizeof(isize));
		break;
	default:
		return 0;
	}

	// Two uint64_t per chain, plus BLOCK_X_SIZE records of padding for the
	// rounded-up grid.
	cudaMalloc((void**)&data, (task->idxCount+BLOCK_X_SIZE)*2*sizeof(uint64_t));
	cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64_t), cudaMemcpyHostToDevice);

	// Publish the task parameters to the device symbols read by the kernels.
	cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
	cudaMemcpyToSymbol(cplDimVec, task->dimVec, min((unsigned long long)task->dimVecSize*sizeof(uint2), (unsigned long long)sizeof(cplDimVec)));
	cudaMemcpyToSymbol(cplChrSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN));
	cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
	cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
	// Reciprocal of the keyspace size for the kernels' modulo reduction.
	uiVal64 = (uint64_t)-1/task->plainSpaceTotal;
	cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));

	// Fixed-width grid: X stays GRID_X_SIZE, Y grows to cover all chains.
	int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	grSizeX = GRID_X_SIZE;
	dim3 numBlocks(grSizeX, grSizeY);
	cuErr = cudaSuccess;

	// Clear the device-side termination flag, then launch in batches,
	// polling termKernel after each so a found hash stops the scan early.
	isize = 0;
	cudaMemcpyToSymbol(termKernel, &isize, sizeof(isize));
	for(unsigned int idx = 0; idx <= task->rainbowChainLen && cuErr == cudaSuccess; idx+=task->kernChainSize) {
		switch(task->hash) {
		case rcuda::RHASH_MD5:
			RTCheckAlarmMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_MD4:
			RTCheckAlarmMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_NTLM:
			RTCheckAlarmNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_SHA1:
			RTCheckAlarmSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_MYSQLSHA1:
			RTCheckAlarmMySQLSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_LM:
			RTCheckAlarmLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_UNDEF:
			break;
		}

		cuErr = cudaThreadSynchronize();
		cudaMemcpyFromSymbol(&isize, termKernel, sizeof(isize));
		if(isize)
			break;
	}

	if(cuErr == cudaSuccess)
		cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64_t), cudaMemcpyDeviceToHost);
	else
		fprintf(stderr, "Error happened: %d (%s)\n", cuErr, cudaGetErrorString(cuErr));

	cudaFree(data);
	return cuErr==cudaSuccess? task->idxCount : -1;
}
597
// Rounds minSize up to the number of chain records the kernels actually
// process: a whole number of grid rows, i.e. a multiple of
// GRID_X_SIZE*BLOCK_X_SIZE (the Y dimension grows to cover the request,
// mirroring the grid computation in the *OnCUDA functions).  Callers must
// allocate their chain buffers with this size.
extern "C" int GetChainsBufferSize(int minSize) {
	// ceil(minSize / BLOCK_X_SIZE) blocks, spread over GRID_X_SIZE columns.
	int grSizeX = (minSize-1)/BLOCK_X_SIZE + 1;
	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	grSizeX = GRID_X_SIZE;
	return grSizeX*grSizeY*BLOCK_X_SIZE;
}
604
// Selects the CUDA device used by subsequent runtime calls on this host
// thread.  Returns the cudaError_t from cudaSetDevice (cudaSuccess == 0).
extern "C" int SetCudaDevice(int device) {
	return cudaSetDevice(device);
}
608