// freerainbowtables is a project for generating, distributing, and using
// perfect rainbow tables
//
// Copyright 2010, 2011 Jan Kyska
// Copyright 2010 Martin Westergaard Jørgensen <martinwj2005@gmail.com>
// Copyright 2010, 2011, 2012 James Nobis <quel@quelrod.net>
//
// This file is part of freerainbowtables.
//
// freerainbowtables is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 2 of the License, or
// (at your option) any later version.
//
// freerainbowtables is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with freerainbowtables.  If not, see <http://www.gnu.org/licenses/>.


#include <stdio.h>
#include <cuda.h>
#include "rcuda.h"
#define   GRID_X_L2     6
#define   GRID_Y_L2     6
#define   BLOCK_X_L2    6
#define   GRID_X_SIZE   (1<<GRID_X_L2)
#define   GRID_Y_SIZE   (1<<GRID_Y_L2)
#define   BLOCK_X_SIZE  (1<<BLOCK_X_L2)
#define   PLAIN_MAX_SIZE     20
//#define   KERN_CHAIN_SIZE   100
#define   CHAR_SET_MAXLEN   512
#define   SHIDX(x)      ((x)<<4)
#define   DATAIDX(vol, idx)  ((((idx)>>4)<<(4+vol))+((idx)&15))
#define   THREADIDX  ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x)
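
// Layout note (inferred from the code below): SHIDX(k) strides each of a
// thread's state words by 16, and DATAIDX(4, t) places thread t at word
// ((t>>4)<<8) + (t&15), so 16 consecutive threads touch 16 consecutive
// shared-memory words for any given k; on GPUs with 16 shared-memory
// banks that keeps the staged accesses bank-conflict free. THREADIDX is
// the flat global thread index across the 2-D grid.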


__device__ uint64_t *dataHeap;
__constant__ __device__ int *plCpPos;
__constant__ __device__ int plCpPosSize;
__constant__ __device__ uint64_t reduceOffset;
__constant__ __device__ uint64_t plainSpaceTotal;
__constant__ __device__ uint64_t rPlainSpaceTotal;
__constant__ __device__ unsigned char cplChrSet[CHAR_SET_MAXLEN];
__constant__ __device__ uint2 cplDimVec[2*PLAIN_MAX_SIZE];
__constant__ __device__ uint64_t cplStartIdx;
__constant__ __device__ unsigned int cplTargetHash[8];
__constant__ __device__ int cplTargetHashSize;
__device__ int termKernel;

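// The RTGEN_* macros below are assembled, together with a hash step, into
// the per-hash kernels in the rcuda_*.inc files included near the end of
// this file. The prologue has thread 0 of each block stage the block's
// chain states from the global dataHeap (two 64-bit words per chain: the
// current index, and a word packing the checkpoint counter in its high
// 16 bits with the collected checkpoint bits in its low 16 bits) into
// shared memory; the epilogue reduces the hash back to an index, records
// checkpoint bits, and writes the states back out.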
#define RTGEN_PROLOGUE  \
	unsigned int *hData;  \
	unsigned int idx;  \
	uint64_t idx64;  \
	unsigned int nPos, ii;  \
	unsigned int cpcheck, checkpoint;  \
	uint3 dimItem;  \
	uint64_t uiDiv64, uiVal64, uiMul64;  \
	uint64_t plainSpace, idx64mod;  \
	unsigned int uiVal, uiDiv;  \
	unsigned int dimVecIdx;  \
	\
	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];  \
	\
	if(threadIdx.x == 0) {  \
		nPos = (THREADIDX<<1);  \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
			hData = shData + DATAIDX(4, ii);  \
			hData[SHIDX(0)] = dataHeap[nPos];  \
			hData[SHIDX(1)] = dataHeap[nPos]>>32;  \
			hData[SHIDX(2)] = dataHeap[nPos+1];  \
		}  \
	}  \
	__syncthreads();  \
	\
	hData = shData + DATAIDX(4, threadIdx.x);  \
	\
	idx64 = hData[SHIDX(1)];  \
	idx64 = (idx64<<32) | hData[SHIDX(0)];  \
	cpcheck = hData[SHIDX(2)];  \
	checkpoint = cpcheck&0x0000ffff;  \
	cpcheck = cpcheck>>16;  \
	\
	for(nPos = chainStart; nPos < chainStop; nPos++) {

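// The epilogue turns the hash words left in hData by the hash step back
// into an index in the plaintext space. The modulo is computed without a
// 64-bit divide: the host precomputes rPlainSpaceTotal =
// (2^64-1)/plainSpaceTotal, so __umul64hi(idx64, rPlainSpaceTotal) gives
// the quotient, possibly one too small, and a single conditional
// subtraction corrects the remainder. At the positions listed in plCpPos,
// one bit of the reduced index is folded into the checkpoint word that
// the lookup side can use to filter false alarms.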
#define RTGEN_EPILOGUE  \
		idx64 = hData[SHIDX(1)];  \
		idx64 = (idx64<<32) | hData[SHIDX(0)];  \
		idx64 += reduceOffset + nPos;  \
		uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);  \
		idx64 -= uiDiv64*plainSpaceTotal;  \
		if(idx64 >= plainSpaceTotal)  \
			idx64 -= plainSpaceTotal;  \
		\
		if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) {  \
			checkpoint |= ((unsigned int)idx64&1) << cpcheck;  \
			cpcheck++;  \
		}  \
	}  \
	\
	hData[SHIDX(0)] = idx64;  \
	hData[SHIDX(1)] = idx64>>32;  \
	hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff);  \
	__syncthreads();  \
	\
	if(threadIdx.x == 0) {  \
		nPos = (THREADIDX<<1);  \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
			hData = shData + DATAIDX(4, ii);  \
			dataHeap[nPos] = ((uint64_t)hData[SHIDX(1)]<<32)|(uint64_t)hData[SHIDX(0)];  \
			dataHeap[nPos+1] = hData[SHIDX(2)];  \
		}  \
	}

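// The IDX2PLAIN macros expand the reduced 64-bit index into plaintext
// characters by repeated division, one cplDimVec entry per character
// position. Each uint2 entry appears to pack the character-set length in
// the low 16 bits of .x, flag bits in the high 16 bits (256 and 512 are
// tested below, 512 acting as an end marker), and in .y a precomputed
// ~2^32/length reciprocal, so the per-character divisions become
// multiply-high operations with a one-step correction. The 64-bit loop
// runs while the index still exceeds 32 bits; the cheaper 32-bit loop
// finishes the remainder.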
#define RTGEN_IDX2PLAIN_BEGIN  \
	dimVecIdx = 0;  \
	ii = 0;  \
	do {  \
		plainSpace = (uint64_t)cplDimVec[dimVecIdx].x | ((uint64_t)cplDimVec[dimVecIdx].y<<32);  \
		dimVecIdx++;  \
		uiVal64 = idx64/plainSpace;  \
		idx64mod = idx64 - uiVal64*plainSpace;  \
		idx64 = uiVal64

#define RTGEN_IDX2PLAIN_END  \
		if(!(dimItem.z&512)) {  \
			while(!((cplDimVec[dimVecIdx].x>>16)&512))  \
				dimVecIdx++;  \
			dimVecIdx++;  \
		}  \
		\
		idx64 = idx64mod;  \
	} while(plainSpace > 1)

#define RTGEN_I2P_LOOP64_BEGIN  \
	dimItem.z = 0;  \
	for(; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE && !(dimItem.z&512); ii++, dimVecIdx++) {  \
		uiVal64 = idx64;  \
		dimItem.x = (cplDimVec[dimVecIdx].x&0xffffu);  \
		dimItem.y = cplDimVec[dimVecIdx].y;  \
		dimItem.z = (cplDimVec[dimVecIdx].x>>16);  \
		\
		uiMul64 = (uint64_t)dimItem.y<<32;  \
		idx64 = __umul64hi(uiVal64, uiMul64);  \
		uiDiv64 = uiVal64 - idx64*(uint64_t)dimItem.x;  \
		uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);  \
		uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;  \
		idx64 += uiVal;  \
		if(uiDiv >= dimItem.x) {  \
			uiDiv -= dimItem.x;  \
			idx64++;  \
		}  \
		\
		if(dimItem.z&256) {  \
			if(!uiDiv && idx64>0) {  \
				uiDiv = dimItem.x;  \
				idx64--;  \
			}  \
			uiDiv -= (uiDiv>0?1u:0);  \
		}

#define RTGEN_I2P_LOOP64_END  \
	}

#define RTGEN_I2P_LOOP32_BEGIN  \
	dimItem.z = 0;  \
	for(idx = (unsigned int)idx64; ii < PLAIN_MAX_SIZE && !(dimItem.z&512); ii++, dimVecIdx++) {  \
		uiVal = idx;  \
		dimItem.x = (cplDimVec[dimVecIdx].x&0xffffu);  \
		dimItem.y = cplDimVec[dimVecIdx].y;  \
		dimItem.z = (cplDimVec[dimVecIdx].x>>16);  \
		\
		idx = __umulhi(uiVal, dimItem.y);  \
		uiDiv = uiVal - idx*dimItem.x;  \
		if(uiDiv >= dimItem.x) {  \
			uiDiv -= dimItem.x;  \
			idx++;  \
		}  \
		\
		if(dimItem.z&256) {  \
			if(!uiDiv && !idx) {  \
				dimVecIdx++;  \
				break;  \
			}  \
			if(!uiDiv && idx>0) {  \
				uiDiv = dimItem.x;  \
				idx--;  \
			}  \
			uiDiv -= (uiDiv>0?1u:0);  \
		}

#define RTGEN_I2P_LOOP32_END  \
	}


/***********   PreCalculate  **************/
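
// The RTPRECALC macros mirror the RTGEN pair but carry only the 64-bit
// index per chain, with no checkpoint state. Each work item also gets its
// own starting step, (unsigned int)cplStartIdx + heap position + 1, so
// the loop below begins at whichever of that and chainStart is larger;
// judging by the naming, this kernel family recomputes candidate chains
// during table lookup.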

#define RTPRECALC_PROLOGUE  \
	unsigned int *hData;  \
	unsigned int idx;  \
	uint64_t idx64;  \
	unsigned int nPos, ii;  \
	unsigned int startNPos;  \
	uint3 dimItem;  \
	uint64_t uiDiv64, uiVal64, uiMul64;  \
	uint64_t plainSpace, idx64mod;  \
	unsigned int uiVal, uiDiv;  \
	unsigned int dimVecIdx;  \
	\
	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];  \
	\
	if(threadIdx.x == 0) {  \
		nPos = THREADIDX;  \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos++) {  \
			hData = shData + DATAIDX(4, ii);  \
			hData[SHIDX(0)] = dataHeap[nPos];  \
			hData[SHIDX(1)] = dataHeap[nPos]>>32;  \
			hData[SHIDX(2)] = (unsigned int)cplStartIdx + nPos + 1;  \
		}  \
	}  \
	__syncthreads();  \
	\
	hData = shData + DATAIDX(4, threadIdx.x);  \
	\
	idx64 = hData[SHIDX(1)];  \
	idx64 = (idx64<<32) | hData[SHIDX(0)];  \
	startNPos = hData[SHIDX(2)];  \
	\
	for(nPos = (startNPos>chainStart? startNPos : chainStart); nPos < chainStop; nPos++) {

#define RTPRECALC_EPILOGUE  \
		idx64 = hData[SHIDX(1)];  \
		idx64 = (idx64<<32) | hData[SHIDX(0)];  \
		idx64 += reduceOffset + nPos;  \
		uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);  \
		idx64 -= uiDiv64*plainSpaceTotal;  \
		if(idx64 >= plainSpaceTotal)  \
			idx64 -= plainSpaceTotal;  \
		\
	}  \
	\
	hData[SHIDX(0)] = idx64;  \
	hData[SHIDX(1)] = idx64>>32;  \
	__syncthreads();  \
	\
	if(threadIdx.x == 0) {  \
		nPos = THREADIDX;  \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos++) {  \
			hData = shData + DATAIDX(4, ii);  \
			dataHeap[nPos] = ((uint64_t)hData[SHIDX(1)]<<32)|(uint64_t)hData[SHIDX(0)];  \
		}  \
	}


/***********   CheckAlarm  **************/
#define  PARAMIDX_ALARM         0
#define  PARAMIDX_GUESSEDPOS    1
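
// Each CheckAlarm work item carries a candidate index plus two extra
// 32-bit parameters packed into its second dataHeap word: the alarm flag
// in the high half and the guessed chain position in the low half (a word
// of all ones marks a chain that has already been answered). The kernel
// walks the chain up to the guessed position; once it gets there it
// compares the resulting hash words against cplTargetHash, and on a match
// it keeps the matching index, raises the alarm flag, and sets termKernel
// so the host stops launching further slices.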

#define RTCHKALARM_PROLOGUE  \
	unsigned int *hData, *hDataX;  \
	unsigned int idx;  \
	uint64_t idx64, lastIdx64;  \
	unsigned int nPos, ii;  \
	unsigned int stopNPos;  \
	uint3 dimItem;  \
	uint64_t uiDiv64, uiVal64, uiMul64;  \
	uint64_t plainSpace, idx64mod;  \
	unsigned int uiVal, uiDiv;  \
	unsigned int dimVecIdx;  \
	\
	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];  \
	__shared__ unsigned int shDataX[BLOCK_X_SIZE<<1];  \
	\
	if(threadIdx.x == 0) {  \
		nPos = (THREADIDX<<1);  \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
			hData = shData + DATAIDX(4, ii);  \
			hDataX = shDataX + DATAIDX(1, ii);  \
			hData[SHIDX(0)] = dataHeap[nPos];  \
			hData[SHIDX(1)] = dataHeap[nPos]>>32;  \
			hDataX[SHIDX(PARAMIDX_ALARM)] = dataHeap[nPos+1]>>32;  \
			hDataX[SHIDX(PARAMIDX_GUESSEDPOS)] = dataHeap[nPos+1];  \
		}  \
	}  \
	__syncthreads();  \
	\
	hData = shData + DATAIDX(4, threadIdx.x);  \
	hDataX = shDataX + DATAIDX(1, threadIdx.x);  \
	\
	idx64 = hData[SHIDX(1)];  \
	idx64 = (idx64<<32) | hData[SHIDX(0)];  \
	stopNPos = hDataX[SHIDX(PARAMIDX_GUESSEDPOS)];  \
	if(chainStart > stopNPos || (stopNPos == 0xfffffffful && hDataX[SHIDX(PARAMIDX_ALARM)] == 0xfffffffful))  \
		return; \
	chainStop = (chainStop>stopNPos? stopNPos : chainStop); \
	\
	lastIdx64 = 0; \
	for(nPos = chainStart; nPos <= chainStop; nPos++) { \
		lastIdx64 = idx64;

#define RTCHKALARM_EPILOGUE  \
		idx64 = hData[SHIDX(1)];  \
		idx64 = (idx64<<32) | hData[SHIDX(0)];  \
		idx64 += reduceOffset + nPos;  \
		uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);  \
		idx64 -= uiDiv64*plainSpaceTotal;  \
		if(idx64 >= plainSpaceTotal)  \
			idx64 -= plainSpaceTotal;  \
		\
	}  \
	\
	if(nPos <= stopNPos) {  \
		hData[SHIDX(0)] = idx64;  \
		hData[SHIDX(1)] = idx64>>32;  \
		hDataX[SHIDX(PARAMIDX_ALARM)] = 0;  \
	} else {  \
		uiVal = 1;  \
		for(nPos = 0; nPos < cplTargetHashSize; nPos++)  \
			uiVal = (cplTargetHash[nPos]==hData[SHIDX(nPos)]? uiVal : 0);  \
		hData[SHIDX(0)] = lastIdx64;  \
		hData[SHIDX(1)] = lastIdx64>>32;  \
		hDataX[SHIDX(PARAMIDX_ALARM)] = uiVal;  \
	}  \
	__syncthreads();  \
	\
	if(threadIdx.x == 0) {  \
		nPos = (THREADIDX<<1);  \
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
			hData = shData + DATAIDX(4, ii);  \
			hDataX = shDataX + DATAIDX(1, ii);  \
			dataHeap[nPos] = ((uint64_t)hData[SHIDX(1)]<<32)|(uint64_t)hData[SHIDX(0)];  \
			if(hDataX[SHIDX(PARAMIDX_ALARM)]) {  \
				dataHeap[nPos+1] = 0xffffffffffffffffull;  \
				termKernel = 1;  \
			}  \
		}  \
	}


#include "rcuda_md5.inc"
#include "rcuda_md4.inc"
#include "rcuda_ntlm.inc"
#include "rcuda_sha1.inc"
#include "rcuda_lm.inc"

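// Host entry point for table generation: uploads the chain states (two
// 64-bit words per chain) and the task constants, then runs the matching
// RTGen kernel over the chain in slices of kernChainSize steps,
// synchronizing after each slice. Returns task->idxCount on success, -1
// on a CUDA error, and 0 for an unsupported hash. A minimal sketch of the
// expected call pattern (hypothetical names and field values, not taken
// from this file):
//
//   rcuda::RCudaTask task;            // filled in by the caller
//   int count = GetChainsBufferSize(chainCount);
//   std::vector<uint64_t> buff(2 * count);
//   // buff[2*i] = start index of chain i, buff[2*i+1] = 0
//   int rc = CalcChainsOnCUDA(&task, buff.data());
//   // on success, buff[2*i] holds the end index and buff[2*i+1] the
//   // checkpoint word of chain i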
extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64_t *resultBuff) {
	cudaSetDeviceFlags(cudaDeviceBlockingSync);

	cudaError_t cuErr;
	uint64_t *data;
	int *cpPos;
	uint64_t uiVal64;

	if(task->charSetSize > CHAR_SET_MAXLEN)
		return -1;

	switch(task->hash) {
	case rcuda::RHASH_MD5:
	case rcuda::RHASH_MD4:
	case rcuda::RHASH_NTLM:
	case rcuda::RHASH_SHA1:
	case rcuda::RHASH_MYSQLSHA1:
	case rcuda::RHASH_LM:
		break;
	default:
		return 0;
	}

	cudaMalloc((void**)&data, (task->idxCount+BLOCK_X_SIZE)*2*sizeof(uint64_t));
	cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));

	cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64_t), cudaMemcpyHostToDevice);
	cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);

	cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
	cudaMemcpyToSymbol(cplDimVec, task->dimVec, min((unsigned long long)task->dimVecSize*sizeof(uint2), (unsigned long long)sizeof(cplDimVec)));
	cudaMemcpyToSymbol(cplChrSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN));
	cudaMemcpyToSymbol(plCpPos, &cpPos, sizeof(cpPos));
	cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize));
	cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
	cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
	uiVal64 = (uint64_t)-1/task->plainSpaceTotal;
	cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));

	int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	grSizeX = GRID_X_SIZE;
	dim3 numBlocks(grSizeX, grSizeY);
	cuErr = cudaSuccess;

	for(unsigned int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=task->kernChainSize) {
		switch(task->hash) {
		case rcuda::RHASH_MD5:
			RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_MD4:
			RTGenMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_NTLM:
			RTGenNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_SHA1:
			RTGenSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_MYSQLSHA1:
			RTGenMySQLSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_LM:
			RTGenLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_UNDEF:
			break;
		}

		cuErr = cudaThreadSynchronize();
	}

	if(cuErr == cudaSuccess)
		cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64_t), cudaMemcpyDeviceToHost);
	else
		fprintf(stderr, "CUDA error %d (%s)\n", cuErr, cudaGetErrorString(cuErr));

	if(task->cpPosSize > 0) {
		cudaFree(cpPos);
	}
	cudaFree(data);
	return cuErr==cudaSuccess? task->idxCount : -1;
}

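// Host entry point for chain precalculation: the same upload/launch/
// download pattern as CalcChainsOnCUDA, but with one 64-bit word per
// chain, no checkpoint positions, and the slice loop starting at
// task->startIdx instead of 0. Return convention is the same: idxCount
// on success, -1 on a CUDA error, 0 for an unsupported hash.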
extern "C" int PreCalculateOnCUDA(const rcuda::RCudaTask* task, uint64_t *resultBuff) {
	cudaSetDeviceFlags(cudaDeviceBlockingSync);

	cudaError_t cuErr;
	uint64_t *data;
	uint64_t uiVal64;

	if(task->charSetSize > CHAR_SET_MAXLEN)
		return -1;

	switch(task->hash) {
	case rcuda::RHASH_MD5:
	case rcuda::RHASH_MD4:
	case rcuda::RHASH_NTLM:
	case rcuda::RHASH_SHA1:
	case rcuda::RHASH_MYSQLSHA1:
	case rcuda::RHASH_LM:
		break;
	default:
		return 0;
	}

	cudaMalloc((void**)&data, (task->idxCount+BLOCK_X_SIZE)*sizeof(uint64_t));
	cudaMemcpy(data, resultBuff, task->idxCount*sizeof(uint64_t), cudaMemcpyHostToDevice);

	cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
	cudaMemcpyToSymbol(cplDimVec, task->dimVec, min((unsigned long long)task->dimVecSize*sizeof(uint2),(unsigned long long)sizeof(cplDimVec)));
	cudaMemcpyToSymbol(cplChrSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN));
	cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
	cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
	uiVal64 = (uint64_t)-1/task->plainSpaceTotal;
	cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));
	cudaMemcpyToSymbol(cplStartIdx, &task->startIdx, sizeof(task->startIdx));

	int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	grSizeX = GRID_X_SIZE;
	dim3 numBlocks(grSizeX, grSizeY);
	cuErr = cudaSuccess;

	for(unsigned int idx = (unsigned int)task->startIdx; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=task->kernChainSize) {
		switch(task->hash) {
		case rcuda::RHASH_MD5:
			RTPreCalcMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_MD4:
			RTPreCalcMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_NTLM:
			RTPreCalcNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_SHA1:
			RTPreCalcSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_MYSQLSHA1:
			RTPreCalcMySQLSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_LM:
			RTPreCalcKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize, task->rainbowChainLen-1));
			break;
		case rcuda::RHASH_UNDEF:
			break;
		}

		cuErr = cudaThreadSynchronize();
	}

	if(cuErr == cudaSuccess)
		cudaMemcpy(resultBuff, data, task->idxCount*sizeof(uint64_t), cudaMemcpyDeviceToHost);
	else
		fprintf(stderr, "CUDA error %d (%s)\n", cuErr, cudaGetErrorString(cuErr));

	cudaFree(data);
	return cuErr==cudaSuccess? task->idxCount : -1;
}

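// Host entry point for alarm checking: uploads the candidate states
// (index plus packed alarm/guessed-position word), selects the target
// hash and its digest size, and launches slices until either the whole
// chain length has been covered or a kernel reports a match through the
// termKernel flag.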
extern "C" int CheckAlarmOnCUDA(const rcuda::RCudaTask* task, uint64_t *resultBuff) {
	cudaSetDeviceFlags(cudaDeviceBlockingSync);

	cudaError_t cuErr;
	uint64_t *data;
	uint64_t uiVal64;
	int isize;

	if(task->charSetSize > CHAR_SET_MAXLEN)
		return -1;

	isize = 0;
	switch(task->hash) {
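	// The cases below fall through deliberately: the first assignment that
	// sees isize == 0 wins, giving 16 bytes for MD5/MD4/NTLM, 20 for
	// SHA1/MySQL-SHA1, and 8 for LM. isize>>2 is then the digest size in
	// 32-bit words, as compared by RTCHKALARM_EPILOGUE.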
	case rcuda::RHASH_MD5:
	case rcuda::RHASH_MD4:
	case rcuda::RHASH_NTLM:
		isize = (isize==0? 16 : isize);
	case rcuda::RHASH_SHA1:
	case rcuda::RHASH_MYSQLSHA1:
		isize = (isize==0? 20 : isize);
	case rcuda::RHASH_LM:
		isize = (isize==0? 8 : isize);
		cudaMemcpyToSymbol(cplTargetHash, task->targetHash, isize);
		isize = (isize>>2);
		cudaMemcpyToSymbol(cplTargetHashSize, &isize, sizeof(isize));
		break;
	default:
		return 0;
	}

	cudaMalloc((void**)&data, (task->idxCount+BLOCK_X_SIZE)*2*sizeof(uint64_t));
	cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64_t), cudaMemcpyHostToDevice);

	cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
	cudaMemcpyToSymbol(cplDimVec, task->dimVec, min((unsigned long long)task->dimVecSize*sizeof(uint2), (unsigned long long)sizeof(cplDimVec)));
	cudaMemcpyToSymbol(cplChrSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN));
	cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
	cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
	uiVal64 = (uint64_t)-1/task->plainSpaceTotal;
	cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));

	int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	grSizeX = GRID_X_SIZE;
	dim3 numBlocks(grSizeX, grSizeY);
	cuErr = cudaSuccess;

	isize = 0;
	cudaMemcpyToSymbol(termKernel, &isize, sizeof(isize));
	for(unsigned int idx = 0; idx <= task->rainbowChainLen && cuErr == cudaSuccess; idx+=task->kernChainSize) {
		switch(task->hash) {
		case rcuda::RHASH_MD5:
			RTCheckAlarmMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_MD4:
			RTCheckAlarmMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_NTLM:
			RTCheckAlarmNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_SHA1:
			RTCheckAlarmSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_MYSQLSHA1:
			RTCheckAlarmMySQLSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_LM:
			RTCheckAlarmLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+task->kernChainSize-1, task->rainbowChainLen));
			break;
		case rcuda::RHASH_UNDEF:
			break;
		}

		cuErr = cudaThreadSynchronize();
		cudaMemcpyFromSymbol(&isize, termKernel, sizeof(isize));
		if(isize)
			break;
	}

	if(cuErr == cudaSuccess)
		cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64_t), cudaMemcpyDeviceToHost);
	else
		fprintf(stderr, "CUDA error %d (%s)\n", cuErr, cudaGetErrorString(cuErr));

	cudaFree(data);
	return cuErr==cudaSuccess? task->idxCount : -1;
}

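// Rounds a requested chain count up to what the fixed launch geometry
// actually processes: a whole number of grid rows, i.e. a multiple of
// GRID_X_SIZE*BLOCK_X_SIZE = 4096 chains. For example, minSize = 1000
// gives grSizeX = 16, grSizeY = 1 and returns 4096; minSize = 5000 gives
// grSizeY = 2 and returns 8192. Callers should size their chain buffers
// to this count, not to minSize, since every launched thread reads and
// writes its slot of dataHeap.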
extern "C" int GetChainsBufferSize(int minSize) {
	int grSizeX = (minSize-1)/BLOCK_X_SIZE + 1;
	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	grSizeX = GRID_X_SIZE;
	return grSizeX*grSizeY*BLOCK_X_SIZE;
}

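// Thin wrapper around cudaSetDevice; returns the cudaError_t value as an
// int (0 == cudaSuccess).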
extern "C" int SetCudaDevice(int device) {
	return cudaSetDevice(device);
}