1 //============================================================================
5 // Description : Generator of FreeRainbowTables / MD5, MD4, NTLM, SHA1, LM
6 //============================================================================
// Launch geometry: all extents are powers of two derived from the *_L2
// (log2) exponents GRID_X_L2, GRID_Y_L2 and BLOCK_X_L2 (defined outside
// this excerpt).
15 #define GRID_X_SIZE (1<<GRID_X_L2)
16 #define GRID_Y_SIZE (1<<GRID_Y_L2)
17 #define BLOCK_X_SIZE (1<<BLOCK_X_L2)
// Maximum plaintext length (bytes) supported by the generator buffers.
18 #define PLAIN_MAX_SIZE 20
// Number of chain links advanced per kernel launch; the host loop in
// CalcChainsOnCUDA() slices the full chain length into pieces of this size.
19 #define KERN_CHAIN_SIZE 100
// Upper bound on the reduction character set staged into shared memory.
20 #define CHAR_SET_MAXLEN 512
// Shared-memory slot index: logical word x of a thread's state lives at
// stride 16 (x<<4) from the thread's base slot.
// NOTE(review): combined with the ((i>>4)<<8)+(i&15) base computation this
// looks like a bank-conflict-avoiding interleave — confirm against the
// shared-memory layout docs before relying on that interpretation.
21 #define SHIDX(x) ((x)<<4)
// Device-global state. Every pointer/value below is written from the host
// via cudaMemcpyToSymbol() in CalcChainsOnCUDA() before any kernel launch.
24 __device__ uint64 *dataHeap;        // chain state: 2 uint64 per chain (index, packed checkpoint word)
25 __device__ unsigned char *plStart;  // start-plaintext buffer (PLAIN_MAX_SIZE bytes, zero-padded)
26 __device__ uint3 *plDimVec;         // per-position dimension vectors for plaintext generation
27 __device__ unsigned char *plChrSet; // reduction character set (up to CHAR_SET_MAXLEN bytes)
28 __device__ int *plCpPos;            // chain positions at which checkpoint bits are sampled
29 __device__ int plCpPosSize;         // number of entries in plCpPos
30 __device__ uint64 reduceOffset;     // table-specific offset added during index reduction
31 __device__ uint64 plainSpaceTotal;  // total size of the plaintext key space
32 __device__ uint64 rPlainSpaceTotal; // (2^64-1)/plainSpaceTotal: reciprocal for __umul64hi modulo
// RTGEN_PROLOGUE — shared opening of every RTGen*Kernel body (see the
// rcuda_*.inc files). It:
//   1. has thread 0 stage this block's chains from global dataHeap into
//      shared memory, splitting each 64-bit index into low/high 32-bit
//      halves so every chain occupies three 32-bit slots (index lo, index
//      hi, checkpoint word) addressed via SHIDX();
//   2. has thread 0 copy the character set, start plaintext and dimension
//      vectors into shared memory for the reduction step;
//   3. unpacks each thread's state: idx64 = 64-bit chain index,
//      checkpoint = low 16 bits of the packed word (accumulated checkpoint
//      bits), cpcheck = high 16 bits (next checkpoint position counter);
//   4. opens the per-link loop over [chainStart, chainStop) whose closing
//      brace is supplied by RTGEN_EPILOGUE.
// NOTE(review): this excerpt elides several original lines (the idx64
// declaration, the staging loop's closing braces, and — presumably — a
// __syncthreads() barrier between staging and use; confirm in the full
// source). Comments are kept outside the macro because line splicing runs
// before comment removal, making //-comments unsafe on \-continued lines.
34 #define RTGEN_PROLOGUE \
35 	unsigned int *hData; \
38 	unsigned int nPos, ii; \
39 	unsigned int cpcheck, checkpoint; \
41 	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; \
42 	__shared__ unsigned char cplChrSet[CHAR_SET_MAXLEN]; \
43 	__shared__ unsigned char cplStart[PLAIN_MAX_SIZE]; \
44 	__shared__ uint3 cplDimVec[PLAIN_MAX_SIZE]; \
46 	if(threadIdx.x == 0) { \
47 		nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1; \
48 		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
49 			hData = shData + ((ii>>4)<<8)+(ii&15); \
50 			hData[SHIDX(0)] = dataHeap[nPos]; \
51 			hData[SHIDX(1)] = dataHeap[nPos]>>32; \
52 			hData[SHIDX(2)] = dataHeap[nPos+1]; \
54 		memcpy(cplChrSet, plChrSet, CHAR_SET_MAXLEN); \
55 		memcpy(cplStart, plStart, PLAIN_MAX_SIZE); \
56 		memcpy(cplDimVec, plDimVec, PLAIN_MAX_SIZE*sizeof(uint3)); \
60 	hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15); \
62 	idx64 = hData[SHIDX(1)]; \
63 	idx64 = (idx64<<32) | hData[SHIDX(0)]; \
64 	cpcheck = hData[SHIDX(2)]; \
65 	checkpoint = cpcheck&0x0000ffff; \
66 	cpcheck = cpcheck>>16; \
68 	for(nPos = chainStart; nPos < chainStop; nPos++) {
// RTGEN_EPILOGUE — shared closing of every RTGen*Kernel body. Per link it:
//   1. re-reads the hashed index halves the kernel body left in shared
//      memory and rebuilds idx64;
//   2. reduces idx64 into the plaintext space: adds reduceOffset + link
//      position, then computes idx64 mod plainSpaceTotal without a 64-bit
//      divide — __umul64hi(idx64, rPlainSpaceTotal) approximates
//      idx64/plainSpaceTotal via the precomputed reciprocal, and the
//      conditional subtraction corrects the one-off the approximation can
//      leave (uiDiv64 is declared on a line elided from this excerpt);
//   3. when the link position matches the next checkpoint in plCpPos,
//      records bit 0 of the reduced index in the checkpoint mask;
//   4. stores the state back to shared memory with the counter/mask
//      repacked as (cpcheck<<16)|checkpoint, and — after the link loop —
//      has thread 0 write the block's chains back to the global dataHeap.
// NOTE(review): the loop/branch closing braces and any barrier before the
// thread-0 write-back fall on lines elided from this excerpt — verify the
// synchronization in the full source.
71 #define RTGEN_EPILOGUE \
72 	idx64 = hData[SHIDX(1)]; \
73 	idx64 = (idx64<<32) | hData[SHIDX(0)]; \
74 	idx64 += reduceOffset + nPos; \
75 	uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); \
76 	idx64 -= uiDiv64*plainSpaceTotal; \
77 	if(idx64 >= plainSpaceTotal) \
78 		idx64 -= plainSpaceTotal; \
80 	if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) { \
81 		checkpoint |= ((unsigned int)idx64&1) << cpcheck; \
86 	hData[SHIDX(0)] = idx64; \
87 	hData[SHIDX(1)] = idx64>>32; \
88 	hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff); \
91 	if(threadIdx.x == 0) { \
92 		nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1; \
93 		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
94 			hData = shData + ((ii>>4)<<8)+(ii&15); \
95 			dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)]; \
96 			dataHeap[nPos+1] = hData[SHIDX(2)]; \
101 #include "rcuda_md5.inc"
102 #include "rcuda_md4.inc"
103 #include "rcuda_ntlm.inc"
104 #include "rcuda_sha1.inc"
105 #include "rcuda_lm.inc"
// Host entry point: advances rainbow chains on the GPU.
//
// Parameters:
//   task       - describes the table: hash algorithm, chain length, chain
//                count (idxCount), reduction parameters and checkpoints.
//   resultBuff - in/out buffer of 2 uint64 per chain (chain index + packed
//                checkpoint word); uploaded, advanced through
//                task->rainbowChainLen-1 links, then copied back in place.
// Returns task->idxCount on success, -1 on any CUDA error.
//
// NOTE(review): this excerpt elides interior lines (the switch headers,
// break statements, default cases, cudaFree cleanup and closing braces);
// the comments below describe only the visible statements.
107 extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff) {
109 	char buff[PLAIN_MAX_SIZE];
111 	unsigned char *stPlain;
113 	unsigned char *charSet;
	// Reject character sets larger than the fixed shared-memory staging buffer.
117 	if(task->charSetSize > CHAR_SET_MAXLEN)
	// Supported algorithms; other task->hash values are presumably rejected
	// on elided lines — confirm in the full source.
121 	case rcuda::RHASH_MD5:
122 	case rcuda::RHASH_MD4:
123 	case rcuda::RHASH_NTLM:
124 	case rcuda::RHASH_SHA1:
125 	case rcuda::RHASH_LM:
	// Zero staging buffer so the start plaintext is zero-padded on device.
131 	memset(buff, 0, PLAIN_MAX_SIZE);
	// Device allocations.
	// NOTE(review): cudaMalloc/cudaMemcpy return codes are not checked here;
	// a failure only surfaces later via cudaGetLastError after a launch.
132 	cudaMalloc((void**)&data, task->idxCount*2*sizeof(uint64));
133 	cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE);
134 	cudaMalloc((void**)&dimVec, max(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3));
135 	cudaMalloc((void**)&charSet, CHAR_SET_MAXLEN);
136 	cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));
	// Upload chain state and reduction tables. stPlain is written twice on
	// purpose: first fully zeroed, then overwritten with the (possibly
	// shorter) actual start plaintext, leaving the tail zero-padded.
138 	cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64), cudaMemcpyHostToDevice);
139 	cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice);
140 	cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice);
141 	cudaMemcpy(dimVec, task->dimVec, min(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3), cudaMemcpyHostToDevice);
142 	cudaMemcpy(charSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN), cudaMemcpyHostToDevice);
143 	cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);
	// Publish buffer addresses and scalar parameters to the __device__
	// symbols the kernels read (declared at the top of this file).
145 	cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
146 	cudaMemcpyToSymbol(plStart, &stPlain, sizeof(stPlain));
147 	cudaMemcpyToSymbol(plDimVec, &dimVec, sizeof(dimVec));
148 	cudaMemcpyToSymbol(plChrSet, &charSet, sizeof(charSet));
149 	cudaMemcpyToSymbol(plCpPos, &cpPos, sizeof(cpPos));
150 	cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize));
151 	cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
152 	cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
	// Reciprocal (2^64-1)/plainSpaceTotal for the __umul64hi-based modulo
	// in RTGEN_EPILOGUE — avoids a 64-bit divide per link on the device.
153 	uiVal64 = (uint64)-1/task->plainSpaceTotal;
154 	cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));
	// Launch geometry: enough blocks to cover idxCount threads. grSizeX is
	// computed only to derive grSizeY (the Y dimension absorbs the overflow),
	// then pinned to the fixed GRID_X_SIZE; GetChainsBufferSize() guarantees
	// the buffer matches this rounded-up capacity.
156 	int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
157 	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
158 	grSizeX = GRID_X_SIZE;
159 	dim3 numBlocks(grSizeX, grSizeY);
	// Block (rather than spin) the host thread while waiting on the GPU.
	// NOTE(review): cudaSetDeviceFlags only takes effect before the context
	// is created; the cudaMalloc calls above already created it, so this
	// call likely fails silently — confirm and move it earlier if so.
162 	cudaSetDeviceFlags(cudaDeviceBlockingSync);
	// Advance all chains through rainbowChainLen-1 links, KERN_CHAIN_SIZE
	// links per launch, stopping early on the first CUDA error.
164 	for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) {
166 		case rcuda::RHASH_MD5:
167 			RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
169 		case rcuda::RHASH_MD4:
170 			RTGenMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
172 		case rcuda::RHASH_NTLM:
173 			RTGenNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
175 		case rcuda::RHASH_SHA1:
176 			RTGenSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
178 		case rcuda::RHASH_LM:
179 			RTGenLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
		// Launch-config errors surface via cudaGetLastError; in-kernel
		// faults surface at the synchronize.
		// NOTE(review): cudaThreadSynchronize is deprecated — modern CUDA
		// spells this cudaDeviceSynchronize.
182 		cuErr = cudaGetLastError();
183 		if(cuErr == cudaSuccess)
184 			cuErr = cudaThreadSynchronize();
	// On success, copy the advanced chain state back over the input buffer.
187 	if(cuErr == cudaSuccess)
188 		cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64), cudaMemcpyDeviceToHost);
	// Error path (the matching else falls on an elided line).
190 		fprintf(stderr, "Error happened: %d (%s)\n", cuErr, cudaGetErrorString(cuErr));
197 	return cuErr==cudaSuccess? task->idxCount : -1;
// Returns the chain-buffer capacity (number of chains) the caller must
// allocate for a requested minimum: minSize rounded up to a whole
// GRID_X_SIZE x grSizeY x BLOCK_X_SIZE launch, mirroring the geometry
// computed in CalcChainsOnCUDA() so every launched thread has a chain slot.
200 extern "C" int GetChainsBufferSize(int minSize) {
201 	int grSizeX = (minSize-1)/BLOCK_X_SIZE + 1;
202 	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	// X is pinned to the full grid width; Y absorbed the remainder above.
203 	grSizeX = GRID_X_SIZE;
204 	return grSizeX*grSizeY*BLOCK_X_SIZE;
// Selects the CUDA device for this host thread. Returns the cudaError_t
// from cudaSetDevice as an int (0 == cudaSuccess).
207 extern "C" int SetCudaDevice(int device) {
208 	return cudaSetDevice(device);