//============================================================================
// Name        : rcuda.cu
// Author      : Jan Kyska
// Version     : 1.00
// Description : Generator of FreeRainbowTables / MD5, MD4, NTLM, SHA1, LM
//============================================================================

#include <stdio.h>
#include <cuda.h>
#include "rcuda.h"

#define   GRID_X_L2     6
#define   GRID_Y_L2     6
#define   BLOCK_X_L2    6
#define   GRID_X_SIZE   (1<<GRID_X_L2)
#define   GRID_Y_SIZE   (1<<GRID_Y_L2)
#define   BLOCK_X_SIZE  (1<<BLOCK_X_L2)
#define   PLAIN_MAX_SIZE     20
#define   KERN_CHAIN_SIZE   100
#define   CHAR_SET_MAXLEN   512
#define   SHIDX(x)      ((x)<<4)

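/*
 * Editor's note on the shared-memory layout (the apparent intent of SHIDX):
 * SHIDX(x) = x<<4 spaces consecutive logical words of one thread 16 apart.
 * Together with the base pointer  shData + ((t>>4)<<8) + (t&15),  each group
 * of 16 threads owns a 256-word tile in which word x of thread t lives at
 * offset 16*x + (t&15). For a fixed x, the 16 threads of a half-warp thus
 * touch 16 consecutive words, i.e. 16 distinct banks of the 16-bank shared
 * memory on the GPUs this code targets, avoiding bank conflicts.
 */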

/* Device-global state; the host sets these via cudaMemcpyToSymbol(). */
__device__ uint64 *dataHeap;            // per-chain state: [index, checkpoint] pairs
__device__ unsigned char *plStart;      // start plaintext
__device__ uint3 *plDimVec;             // per-position dimension data for the index->plaintext mapping
__device__ unsigned char *plChrSet;     // character set
__device__ int *plCpPos;                // checkpoint positions within a chain
__device__ int plCpPosSize;             // number of checkpoint positions
__device__ uint64 reduceOffset;         // table-specific offset of the reduction function
__device__ uint64 plainSpaceTotal;      // size of the plaintext space
__device__ uint64 rPlainSpaceTotal;     // ~2^64/plainSpaceTotal (see the note after RTGEN_EPILOGUE)

#define RTGEN_PROLOGUE  \
        unsigned int *hData;  \
        unsigned int idx;  \
        uint64 idx64;  \
        unsigned int nPos, ii;  \
        unsigned int cpcheck, checkpoint;  \
        \
        __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];  \
        __shared__ unsigned char cplChrSet[CHAR_SET_MAXLEN];  \
        __shared__ unsigned char cplStart[PLAIN_MAX_SIZE];  \
        __shared__ uint3 cplDimVec[PLAIN_MAX_SIZE];  \
        \
        if(threadIdx.x == 0) {  \
                nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;  \
                for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
                        hData = shData + ((ii>>4)<<8)+(ii&15);  \
                        hData[SHIDX(0)] = dataHeap[nPos];  \
                        hData[SHIDX(1)] = dataHeap[nPos]>>32;  \
                        hData[SHIDX(2)] = dataHeap[nPos+1];  \
                }  \
                memcpy(cplChrSet, plChrSet, CHAR_SET_MAXLEN);  \
                memcpy(cplStart, plStart, PLAIN_MAX_SIZE);  \
                memcpy(cplDimVec, plDimVec, PLAIN_MAX_SIZE*sizeof(uint3));  \
        }  \
        __syncthreads();  \
        \
        hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15);  \
        \
        idx64 = hData[SHIDX(1)];  \
        idx64 = (idx64<<32) | hData[SHIDX(0)];  \
        cpcheck = hData[SHIDX(2)];  \
        checkpoint = cpcheck&0x0000ffff;  \
        cpcheck = cpcheck>>16;  \
        \
        for(nPos = chainStart; nPos < chainStop; nPos++) {


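/*
 * The hash-specific kernels are defined in the rcuda_*.inc files included
 * below. A minimal sketch of the shape they presumably have (illustrative
 * only, not the actual contents of those files):
 *
 *   __global__ void RTGenMD5Kernel(int chainStart, int chainStop) {
 *           uint64 uiDiv64;     // used by RTGEN_EPILOGUE
 *           RTGEN_PROLOGUE
 *           // map idx64 to a plaintext via cplStart/cplDimVec/cplChrSet,
 *           // hash it, and leave the digest words in hData[SHIDX(0..)]
 *           RTGEN_EPILOGUE
 *   }
 *
 * RTGEN_PROLOGUE loads each thread's chain state from dataHeap into shared
 * memory and opens the loop over the chain links [chainStart, chainStop);
 * RTGEN_EPILOGUE reduces the hash back into the plaintext index space,
 * records checkpoint bits, closes the loop, and writes the state back.
 */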
#define RTGEN_EPILOGUE  \
                idx64 = hData[SHIDX(1)];  \
                idx64 = (idx64<<32) | hData[SHIDX(0)];  \
                idx64 += reduceOffset + nPos;  \
                uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);  \
                idx64 -= uiDiv64*plainSpaceTotal;  \
                if(idx64 >= plainSpaceTotal)  \
                        idx64 -= plainSpaceTotal;  \
                \
                if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) {  \
                        checkpoint |= ((unsigned int)idx64&1) << cpcheck;  \
                        cpcheck++;  \
                }  \
        }  \
        \
        hData[SHIDX(0)] = idx64;  \
        hData[SHIDX(1)] = idx64>>32;  \
        hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff);  \
        __syncthreads();  \
        \
        if(threadIdx.x == 0) {  \
                nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;  \
                for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
                        hData = shData + ((ii>>4)<<8)+(ii&15);  \
                        dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)];  \
                        dataHeap[nPos+1] = hData[SHIDX(2)];  \
                }  \
        }


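/*
 * Note on the division-free reduction in RTGEN_EPILOGUE: the host precomputes
 * rPlainSpaceTotal = (2^64-1)/plainSpaceTotal, so
 * __umul64hi(idx64, rPlainSpaceTotal) = floor(idx64*r / 2^64) under-estimates
 * the true quotient idx64/plainSpaceTotal by at most one. The remainder left
 * after subtracting quotient*plainSpaceTotal can therefore still be one
 * multiple too large, which the single conditional subtraction corrects.
 * Worked example with N = 10: r = floor((2^64-1)/10), idx64 = 137 gives a
 * quotient estimate of 13, and 137 - 130 = 7 = 137 mod 10.
 */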
#include "rcuda_md5.inc"
#include "rcuda_md4.inc"
#include "rcuda_ntlm.inc"
#include "rcuda_sha1.inc"
#include "rcuda_lm.inc"

extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff) {
        cudaError_t cuErr;
        char buff[PLAIN_MAX_SIZE];
        uint64 *data;
        unsigned char *stPlain;
        uint3 *dimVec;
        unsigned char *charSet;
        int *cpPos;
        uint64 uiVal64;

        if(task->charSetSize > CHAR_SET_MAXLEN)
                return -1;

        switch(task->hash) {
        case rcuda::RHASH_MD5:
        case rcuda::RHASH_MD4:
        case rcuda::RHASH_NTLM:
        case rcuda::RHASH_SHA1:
        case rcuda::RHASH_LM:
                break;
        default:
                return 0;       // unsupported hash: nothing to compute
        }

        memset(buff, 0, PLAIN_MAX_SIZE);
        cudaMalloc((void**)&data, task->idxCount*2*sizeof(uint64));
        cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE);
        cudaMalloc((void**)&dimVec, max(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3));
        cudaMalloc((void**)&charSet, CHAR_SET_MAXLEN);
        cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));

        cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64), cudaMemcpyHostToDevice);
        // zero-fill the device start-plaintext buffer, then copy the real prefix over it
        cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice);
        cudaMemcpy(dimVec, task->dimVec, min(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3), cudaMemcpyHostToDevice);
        cudaMemcpy(charSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN), cudaMemcpyHostToDevice);
        cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);

        cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
        cudaMemcpyToSymbol(plStart, &stPlain, sizeof(stPlain));
        cudaMemcpyToSymbol(plDimVec, &dimVec, sizeof(dimVec));
        cudaMemcpyToSymbol(plChrSet, &charSet, sizeof(charSet));
        cudaMemcpyToSymbol(plCpPos, &cpPos, sizeof(cpPos));
        cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize));
        cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
        cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
        // reciprocal (2^64-1)/N for the division-free reduction in RTGEN_EPILOGUE
        uiVal64 = (uint64)-1/task->plainSpaceTotal;
        cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));

        // round idxCount up to whole blocks, then fold the block count into a
        // GRID_X_SIZE-wide two-dimensional grid
        int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
        int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
        grSizeX = GRID_X_SIZE;
        dim3 numBlocks(grSizeX, grSizeY);
        cuErr = cudaSuccess;

        // NB: this only takes effect if it precedes the first context-creating
        // runtime call (the cudaMalloc()s above)
        cudaSetDeviceFlags(cudaDeviceBlockingSync);
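        /* The chain is computed in slices of at most KERN_CHAIN_SIZE links per
           kernel launch, presumably to keep each launch short (e.g. under the
           display watchdog limit) and, with blocking sync, to let the host
           thread yield the CPU between slices. */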
        for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) {
                switch(task->hash) {
                case rcuda::RHASH_MD5:
                        RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                case rcuda::RHASH_MD4:
                        RTGenMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                case rcuda::RHASH_NTLM:
                        RTGenNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                case rcuda::RHASH_SHA1:
                        RTGenSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                case rcuda::RHASH_LM:
                        RTGenLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
                        break;
                }
                cuErr = cudaGetLastError();
                if(cuErr == cudaSuccess)
                        cuErr = cudaThreadSynchronize();        // pre-CUDA 4.0 spelling of cudaDeviceSynchronize()
        }

        if(cuErr == cudaSuccess)
                cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64), cudaMemcpyDeviceToHost);
        else
                fprintf(stderr, "CUDA error %d (%s)\n", cuErr, cudaGetErrorString(cuErr));

        cudaFree(cpPos);
        cudaFree(charSet);
        cudaFree(dimVec);
        cudaFree(stPlain);
        cudaFree(data);
        return cuErr==cudaSuccess? task->idxCount : -1;
}

// Rounds minSize up to the number of chains one full kernel grid processes;
// callers should size their chain buffer (and idxCount) to the returned value.
extern "C" int GetChainsBufferSize(int minSize) {
        int grSizeX = (minSize-1)/BLOCK_X_SIZE + 1;
        int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
        grSizeX = GRID_X_SIZE;
        return grSizeX*grSizeY*BLOCK_X_SIZE;
}

extern "C" int SetCudaDevice(int device) {
        return cudaSetDevice(device);
}
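
/*
 * Host-side usage sketch (illustrative; the RCudaTask fields are those
 * referenced above, everything else is hypothetical caller code):
 *
 *   rcuda::RCudaTask task;
 *   // ... fill task.hash, charSet/charSetSize, stPlain/stPlainSize,
 *   //     dimVec/dimVecSize, cpPositions/cpPosSize, reduceOffset,
 *   //     plainSpaceTotal and rainbowChainLen ...
 *   SetCudaDevice(0);
 *   int n = GetChainsBufferSize(wantedChains);  // rounded up to a full grid
 *   std::vector<uint64> buff(2*n);              // [start index, checkpoint] per chain
 *   // seed buff[2*i] with each chain's start index and buff[2*i+1] with 0
 *   task.idxCount = n;
 *   int rc = CalcChainsOnCUDA(&task, buff.data());
 *   // rc == n on success (buff now holds the end indices), rc < 0 on error
 */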