//============================================================================
// Name        : rcuda.cu
// Author      : Jan Kyska
// Version     : 1.00
// Description : Generator of FreeRainbowTables / MD5, MD4, NTLM, SHA1, LM
//============================================================================

#include <stdio.h>
#include <cuda.h>
#include "rcuda.h"

#define   GRID_X_L2     6
#define   GRID_Y_L2     6
#define   BLOCK_X_L2    6
#define   GRID_X_SIZE   (1<<GRID_X_L2)
#define   GRID_Y_SIZE   (1<<GRID_Y_L2)
#define   BLOCK_X_SIZE  (1<<BLOCK_X_L2)
#define   PLAIN_MAX_SIZE     20
#define   KERN_CHAIN_SIZE   100
#define   CHAR_SET_MAXLEN   512
#define   SHIDX(x)      ((x)<<4)
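
// SHIDX(x) addresses word x of a thread's state in shared memory. Threads are
// grouped 16 to a 256-word tile and each thread's words are strided 16 apart,
// so the 16 threads of a half-warp touch consecutive addresses (distinct
// banks) and the shared-memory accesses stay conflict-free.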
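
// Module-scope device state. The host binds the buffer pointers and per-task
// constants below via cudaMemcpyToSymbol() before launching the kernels.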
__device__ uint64 *dataHeap;
__device__ unsigned char *plStart;
__device__ uint3 *plDimVec;
__device__ unsigned char *plChrSet;
__device__ int *plCpPos;
__device__ int plCpPosSize;
__device__ uint64 reduceOffset;
__device__ uint64 plainSpaceTotal;
__device__ uint64 rPlainSpaceTotal;

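// RTGEN_PROLOGUE stages the kernel's inputs and opens the per-link loop:
// thread 0 copies this block's chain records (a 64-bit index plus a packed
// checkpoint word) from the global heap into the interleaved shared layout
// and caches the character set and plaintext dimension tables; every thread
// then unpacks its index and its checkpoint progress/bits and starts walking
// links chainStart..chainStop-1.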
#define RTGEN_PROLOGUE  \
        unsigned int *hData;  \
        unsigned int idx;  \
        uint64 idx64;  \
        unsigned int nPos, ii;  \
        unsigned int cpcheck, checkpoint;  \
        \
        __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];  \
        __shared__ unsigned char cplChrSet[CHAR_SET_MAXLEN];  \
        __shared__ unsigned char cplStart[PLAIN_MAX_SIZE];  \
        __shared__ uint3 cplDimVec[PLAIN_MAX_SIZE];  \
        \
        if(threadIdx.x == 0) {  \
                nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;  \
                for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
                        hData = shData + ((ii>>4)<<8)+(ii&15);  \
                        hData[SHIDX(0)] = dataHeap[nPos];  \
                        hData[SHIDX(1)] = dataHeap[nPos]>>32;  \
                        hData[SHIDX(2)] = dataHeap[nPos+1];  \
                }  \
                memcpy(cplChrSet, plChrSet, CHAR_SET_MAXLEN);  \
                memcpy(cplStart, plStart, PLAIN_MAX_SIZE);  \
                memcpy(cplDimVec, plDimVec, PLAIN_MAX_SIZE*sizeof(uint3));  \
        }  \
        __syncthreads();  \
        \
        hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15);  \
        \
        idx64 = hData[SHIDX(1)];  \
        idx64 = (idx64<<32) | hData[SHIDX(0)];  \
        cpcheck = hData[SHIDX(2)];  \
        checkpoint = cpcheck&0x0000ffff;  \
        cpcheck = cpcheck>>16;  \
        \
        for(nPos = chainStart; nPos < chainStop; nPos++) {

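// RTGEN_EPILOGUE closes the loop opened by the prologue. It applies the
// reduction function to the freshly hashed value: idx64 is offset by
// reduceOffset + nPos and taken modulo plainSpaceTotal without a 64-bit
// division. With r = rPlainSpaceTotal ~ 2^64/N, __umul64hi(idx64, r) is
// floor(idx64/N) or one less, so a single conditional subtract of N fixes
// the remainder (uiDiv64 is declared by each kernel body between the two
// macros). At each configured checkpoint link the low bit of the reduced
// index is recorded for later false-alarm filtering, and at the end the
// chain state is written back to the global heap.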
#define RTGEN_EPILOGUE  \
                idx64 = hData[SHIDX(1)];  \
                idx64 = (idx64<<32) | hData[SHIDX(0)];  \
                idx64 += reduceOffset + nPos;  \
                uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);  \
                idx64 -= uiDiv64*plainSpaceTotal;  \
                if(idx64 >= plainSpaceTotal)  \
                        idx64 -= plainSpaceTotal;  \
                \
                if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) {  \
                        checkpoint |= ((unsigned int)idx64&1) << cpcheck;  \
                        cpcheck++;  \
                }  \
        }  \
        \
        hData[SHIDX(0)] = idx64;  \
        hData[SHIDX(1)] = idx64>>32;  \
        hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff);  \
        __syncthreads();  \
        \
        if(threadIdx.x == 0) {  \
                nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;  \
                for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
                        hData = shData + ((ii>>4)<<8)+(ii&15);  \
                        dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)];  \
                        dataHeap[nPos+1] = hData[SHIDX(2)];  \
                }  \
        }

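// Each include below expands to one complete kernel (RTGenMD5Kernel, etc.)
// built around RTGEN_PROLOGUE, a hash-specific chain step, and RTGEN_EPILOGUE.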
#include "rcuda_md5.inc"
#include "rcuda_md4.inc"
#include "rcuda_ntlm.inc"
#include "rcuda_sha1.inc"
#include "rcuda_lm.inc"

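// Computes task->rainbowChainLen-1 links for each of task->idxCount chains,
// reading the chain heads from resultBuff and writing the end points back.
// A minimal host-side sketch (startIndex() is a hypothetical stand-in for
// however the caller picks chain heads; other RCudaTask fields omitted):
//
//     int n = GetChainsBufferSize(4096);          // rounded up to a full grid
//     uint64 *buff = new uint64[2*n];
//     for(int i = 0; i < n; i++) {
//         buff[2*i] = startIndex(i);              // chain head index
//         buff[2*i+1] = 0;                        // packed checkpoint word
//     }
//     task.idxCount = n;
//     int rc = CalcChainsOnCUDA(&task, buff);     // n on success, -1 on error
//
// Returns task->idxCount on success, -1 on a CUDA error or an oversized
// character set, and 0 for an unsupported hash type.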
extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff) {
        cudaError_t cuErr;
        char buff[PLAIN_MAX_SIZE];
        uint64 *data;
        unsigned char *stPlain;
        uint3 *dimVec;
        unsigned char *charSet;
        int *cpPos;
        uint64 uiVal64;

        if(task->charSetSize > CHAR_SET_MAXLEN)
                return -1;

        switch(task->hash) {
        case rcuda::RHASH_MD5:
        case rcuda::RHASH_MD4:
        case rcuda::RHASH_NTLM:
        case rcuda::RHASH_SHA1:
        case rcuda::RHASH_LM:
                break;
        default:
                return 0;
        }

        // Request blocking synchronization before the first runtime call below
        // creates the CUDA context; setting this flag afterwards has no effect.
        cudaSetDeviceFlags(cudaDeviceBlockingSync);

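        // Allocate the fixed-size device buffers. The start plaintext is
        // zero-padded to PLAIN_MAX_SIZE, and the copies below clamp the host
        // sizes with min() so none of the fixed buffers can be overrun.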
        memset(buff, 0, PLAIN_MAX_SIZE);
        cudaMalloc((void**)&data, task->idxCount*2*sizeof(uint64));
        cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE);
        cudaMalloc((void**)&dimVec, max(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3));
        cudaMalloc((void**)&charSet, CHAR_SET_MAXLEN);
        cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));

        cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64), cudaMemcpyHostToDevice);
        cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice);
        cudaMemcpy(dimVec, task->dimVec, min(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3), cudaMemcpyHostToDevice);
        cudaMemcpy(charSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN), cudaMemcpyHostToDevice);
        cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);

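        // Bind the device pointers and the per-task constants to the
        // module-scope __device__ symbols read by the kernels.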
        cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
        cudaMemcpyToSymbol(plStart, &stPlain, sizeof(stPlain));
        cudaMemcpyToSymbol(plDimVec, &dimVec, sizeof(dimVec));
        cudaMemcpyToSymbol(plChrSet, &charSet, sizeof(charSet));
        cudaMemcpyToSymbol(plCpPos, &cpPos, sizeof(cpPos));
        cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize));
        cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
        cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
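        // Precompute the fixed-point reciprocal used by RTGEN_EPILOGUE:
        // (uint64)-1 / N = floor((2^64-1)/N) ~ 2^64/N, so on the device
        // __umul64hi(x, uiVal64) approximates x/N to within one. For example,
        // with N = 1000 and x = 123456 the high product is exactly 123.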
        uiVal64 = (uint64)-1/task->plainSpaceTotal;
        cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));

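        // The launch grid is always GRID_X_SIZE blocks wide; only the row count
        // grSizeY scales with idxCount. One thread walks one chain, so a launch
        // covers grSizeX*grSizeY*BLOCK_X_SIZE chains (cf. GetChainsBufferSize).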
        int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
        int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
        grSizeX = GRID_X_SIZE;
        dim3 numBlocks(grSizeX, grSizeY);
        cuErr = cudaSuccess;

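        // Advance the chains KERN_CHAIN_SIZE links per launch. Short launches
        // keep the display driver's watchdog satisfied and let errors surface
        // between chunks rather than only at the end of the table.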
        for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) {
                switch(task->hash) {
                case rcuda::RHASH_MD5:
                        RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                case rcuda::RHASH_MD4:
                        RTGenMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                case rcuda::RHASH_NTLM:
                        RTGenNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                case rcuda::RHASH_SHA1:
                        RTGenSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                case rcuda::RHASH_LM:
                        RTGenLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                }
                cuErr = cudaGetLastError();
                if(cuErr == cudaSuccess)
                        cuErr = cudaThreadSynchronize();  // cudaDeviceSynchronize() in newer CUDA
        }

        if(cuErr == cudaSuccess)
                cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64), cudaMemcpyDeviceToHost);
        else
                fprintf(stderr, "CUDA error: %d (%s)\n", cuErr, cudaGetErrorString(cuErr));

        cudaFree(cpPos);
        cudaFree(charSet);
        cudaFree(dimVec);
        cudaFree(stPlain);
        cudaFree(data);
        return cuErr==cudaSuccess? task->idxCount : -1;
}

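// Rounds minSize up to a whole launch grid so the caller can size resultBuff;
// the buffer needs two uint64 entries per chain returned here.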
extern "C" int GetChainsBufferSize(int minSize) {
        int grSizeX = (minSize-1)/BLOCK_X_SIZE + 1;
        int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
        grSizeX = GRID_X_SIZE;
        return grSizeX*grSizeY*BLOCK_X_SIZE;
}

extern "C" int SetCudaDevice(int device) {
        return cudaSetDevice(device);
}