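/*
 * rcuda.cu - CUDA MD5 rainbow-chain generation for the distributed rainbow
 * table generator (distrrtgen) BOINC client app.
 */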
#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include "rcuda.h"

// Kernel launch geometry (log2 values): a 64x64 grid of 128-thread blocks.
#define   GRID_X_L2     6
#define   GRID_Y_L2     6
#define   BLOCK_X_L2    7
#define   GRID_X_SIZE   (1<<GRID_X_L2)
#define   GRID_Y_SIZE   (1<<GRID_Y_L2)
#define   BLOCK_X_SIZE  (1<<BLOCK_X_L2)
// Chain links computed per kernel launch.
#define   KERN_CHAIN_SIZE   100
// Stride-16 index into the shared-memory working buffers (see shData below):
// word i of a thread's 16-word block sits 16 words apart, so the 16 threads
// of a group touch 16 different shared-memory banks.
#define   SHIDX(x)      ((x)<<4)


// MD5 initial state (A, B, C, D).
__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 };
// Per-step left-rotation amounts for the four MD5 rounds.
__device__ __constant__ unsigned char r[64] = { \
		7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22, \
		5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20, \
		4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23, \
		6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21 };
// Message-word index used in each of the 64 MD5 steps.
__device__ __constant__ unsigned char g[64] = { \
		0, 1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15, \
		1, 6, 11,  0,   5, 10, 15,  4,   9, 14,  3,  8,  13,  2,  7, 12, \
		5, 8, 11, 14,   1,  4,  7, 10,  13,  0,  3,  6,   9, 12, 15,  2, \
		0, 7, 14,  5,  12,  3, 10,  1,   8, 15,  6, 13,   4, 11,  2,  9 };
// MD5 additive constants: ac[i] = floor(2^32 * abs(sin(i + 1))).
__device__ __constant__ unsigned int ac[64] = { \
		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, \
		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, \
		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, \
		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, \
		\
		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, \
		0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, \
		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, \
		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, \
		\
		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, \
		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, \
		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, \
		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, \
		\
		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, \
		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, \
		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, \
		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };

// Pre-padded single-block MD5 message for the 8-character test string
// "cmpygvbm"; apparently unused by the kernel below.
__device__ __constant__ unsigned int testData[16] = { \
		0x79706d63, 0x6d627667, 0x00000080, 0x00000000, \
		0x00000000, 0x00000000, 0x00000000, 0x00000000, \
		0x00000000, 0x00000000, 0x00000000, 0x00000000, \
		0x00000000, 0x00000000, 0x00000040, 0x00000000 };

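/*
 * The four MD5 round operations (F, G, H, I), each fused with the
 * add-constant, add-message-word, rotate, and add-b steps of one MD5 step.
 * The message word is read through the stride-16 SHIDX layout.
 */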
__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
	unsigned int ret;
	ret = a + ((b&c)|((~b)&d)) + ac[i] + data[SHIDX(g[i])];
	ret = (ret<<r[i])|(ret>>(32-r[i]));
	ret += b;
	return ret;
}

__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
	unsigned int ret;
	ret = a + ((b&d)|(c&(~d))) + ac[i] + data[SHIDX(g[i])];
	ret = (ret<<r[i])|(ret>>(32-r[i]));
	ret += b;
	return ret;
}

__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
	unsigned int ret;
	ret = a + (b^c^d) + ac[i] + data[SHIDX(g[i])];
	ret = (ret<<r[i])|(ret>>(32-r[i]));
	ret += b;
	return ret;
}

__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
	unsigned int ret;
	ret = a + (c^(b|(~d))) + ac[i] + data[SHIDX(g[i])];
	ret = (ret<<r[i])|(ret>>(32-r[i]));
	ret += b;
	return ret;
}

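/*
 * Single-block MD5.  The 16 message words are read from dataHash through
 * the stride-16 SHIDX layout, and the four digest words are written back
 * in place to dataHash[SHIDX(0..3)].
 */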
__device__ void MD5(unsigned int* dataHash) {
	unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x;
	int ii;

	// Round 1
	for(ii = 0; ii < 16; ii++) {
		x = b;
		b = FF(a, b, c, d, ii, dataHash);
		a = d; d = c; c = x;
	}

	// Round 2
	for(; ii < 32; ii++) {
		x = b;
		b = GG(a, b, c, d, ii, dataHash);
		a = d; d = c; c = x;
	}

	// Round 3
	for(; ii < 48; ii++) {
		x = b;
		b = HH(a, b, c, d, ii, dataHash);
		a = d; d = c; c = x;
	}

	// Round 4
	for(; ii < 64; ii++) {
		x = b;
		b = II(a, b, c, d, ii, dataHash);
		a = d; d = c; c = x;
	}

	dataHash[SHIDX(0)] = a + h[0];
	dataHash[SHIDX(1)] = b + h[1];
	dataHash[SHIDX(2)] = c + h[2];
	dataHash[SHIDX(3)] = d + h[3];
}

// Maximum plaintext length, in characters.
#define   PLAIN_MAX_SIZE     20

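/*
 * Per-task configuration, set from the host with cudaMemcpyToSymbol()
 * (field meanings inferred from their use in the kernel below).
 * dataHeap holds two 64-bit words per chain: the current chain index and
 * the packed checkpoint state.  plStart gives a per-position offset added
 * to the index.  plDimVec describes the charset for each plaintext
 * position: x = charset size, y = fixed-point reciprocal of x,
 * z = offset into plChrSet.  plCpPos lists the checkpoint link positions.
 * rPlainSpaceTotal is a reciprocal of plainSpaceTotal used for the modulo
 * in the reduction function.
 */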
__device__ unsigned __int64 *dataHeap;
__device__ unsigned char *plStart;
__device__ uint3 *plDimVec;
__device__ unsigned char *plChrSet;
__device__ int *plCpPos;
__device__ int plCpPosSize;
__device__ unsigned __int64 reduceOffset;
__device__ unsigned __int64 plainSpaceTotal;
__device__ unsigned __int64 rPlainSpaceTotal;

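/*
 * Rainbow-chain generation kernel.  Each thread advances one chain from
 * link chainStart to chainStop: the current 64-bit index is turned into a
 * plaintext, hashed with MD5, and the digest is reduced back to an index
 * for the next link.  One checkpoint bit is collected at each configured
 * checkpoint position.
 */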
__global__ void RTGenMD5Kernel(unsigned int chainStart, unsigned int chainStop) {
	unsigned int *hData;
	uint3 dimItem;
	unsigned int uiVal, uiMul, uiDiv, idx;
	unsigned __int64 uiVal64, uiMul64, uiDiv64, idx64;
	unsigned int nPos, size, ii, jj, kk;
	unsigned int cpcheck, checkpoint;
	unsigned int plain;

	// One 16-word working buffer per thread (message block and digest),
	// interleaved via SHIDX across the shared-memory banks.
	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];

	// Thread 0 stages the whole block's chain state (two 64-bit words per
	// chain: the current index plus the packed checkpoint state) from
	// global memory into shared memory.
	if(threadIdx.x == 0) {
		nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {
			hData = shData + ((ii>>4)<<8)+(ii&15);
			hData[SHIDX(0)] = dataHeap[nPos];
			hData[SHIDX(1)] = dataHeap[nPos]>>32;
			hData[SHIDX(2)] = dataHeap[nPos+1];
		}
	}
	__syncthreads();

	// Per-thread pointer into the stride-16 shared buffer.
	hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15);

	// Unpack the 64-bit chain index and the packed checkpoint state
	// (high 16 bits: next checkpoint slot, low 16 bits: collected bits).
	idx64 = hData[SHIDX(1)];
	idx64 = (idx64<<32) | hData[SHIDX(0)];
	cpcheck = hData[SHIDX(2)];
	checkpoint = cpcheck&0x0000ffff;
	cpcheck = cpcheck>>16;

	// Walk the chain from link chainStart to chainStop.
	for(nPos = chainStart; nPos < chainStop; nPos++) {
		// Transform the index into a plaintext: one character per position,
		// packed together with the 0x80 MD5 padding byte into 32-bit scratch
		// words (word index jj counting down).  The first loop uses 64-bit
		// arithmetic while the index is still large; dimItem.y is a
		// fixed-point reciprocal of the charset size dimItem.x, and
		// dimItem.z is the offset of this position's charset in plChrSet.
		plain = 0x80;
		jj = (PLAIN_MAX_SIZE>>2)+1;
		for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) {
			uiVal64 = idx64 + plStart[ii];
			uiVal64--;
			dimItem = plDimVec[ii];
			uiMul64 = (unsigned __int64)dimItem.y<<32;
			idx64 = __umul64hi(uiVal64, uiMul64);
			uiDiv64 = uiVal64 - idx64*(unsigned __int64)dimItem.x;
			uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);
			uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;
			idx64 += uiVal;
			if(uiDiv >= dimItem.x) {
				uiDiv -= dimItem.x;
				idx64++;
			}
			plain = (plain<<8) | plChrSet[dimItem.z + uiDiv];
			if((ii&3) == 2) {
				hData[SHIDX(jj--)] = plain;
				plain = 0;
			}
		}
		// Once the remaining index fits in 32 bits, finish with cheaper
		// 32-bit reciprocal division.
		for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) {
			uiVal = idx + plStart[ii];
			uiVal--;
			dimItem = plDimVec[ii];
			idx = __umulhi(uiVal, dimItem.y);
			uiDiv = uiVal - idx*dimItem.x;
			if(uiDiv >= dimItem.x) {
				uiDiv -= dimItem.x;
				idx++;
			}
			plain = (plain<<8) | plChrSet[dimItem.z + uiDiv];
			if((ii&3) == 2) {
				hData[SHIDX(jj--)] = plain;
				plain = 0;
			}
		}

		// Prepare the padded one-block MD5 message: shift the scratch words
		// into place starting at word 0, zero-fill the rest, and store the
		// message length in bits in word 14.
		size = ii;
		ii = ((((3-(ii&3))<<3)-1)&0x1f)+1;
		plain = plain<<ii;
		for(jj++, kk = 0; jj <= (PLAIN_MAX_SIZE>>2)+1; plain = hData[SHIDX(jj++)], kk++)
			hData[SHIDX(kk)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii));
		hData[SHIDX(kk)] = plain>>ii;
		for(kk++; kk < 14; kk++)
			hData[SHIDX(kk)] = 0;
		hData[SHIDX(kk++)] = size<<3;
		hData[SHIDX(kk)] = 0;

		// Hash the block in place; the digest ends up in words 0..3.
		MD5(hData);

		// Reduction function: take the first 64 bits of the digest, add the
		// link-dependent offset, and reduce modulo the total plaintext space
		// using the precomputed reciprocal rPlainSpaceTotal.
		idx64 = hData[SHIDX(1)];
		idx64 = (idx64<<32) | hData[SHIDX(0)];
		idx64 += reduceOffset + nPos;
		uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);
		idx64 -= uiDiv64*plainSpaceTotal;
		if(idx64 >= plainSpaceTotal)
			idx64 -= plainSpaceTotal;

		// Record one checkpoint bit when this link is a checkpoint position.
		if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) {
			checkpoint |= ((unsigned int)idx64&1) << cpcheck;
			cpcheck++;
		}
	}

	// Pack the final chain index and checkpoint state back into shared
	// memory, then let thread 0 copy the block's state to global memory.
	hData[SHIDX(0)] = idx64;
	hData[SHIDX(1)] = idx64>>32;
	hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff);
	__syncthreads();

	if(threadIdx.x == 0) {
		nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;
		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {
			hData = shData + ((ii>>4)<<8)+(ii&15);
			dataHeap[nPos] = ((unsigned __int64)hData[SHIDX(1)]<<32)|(unsigned __int64)hData[SHIDX(0)];
			dataHeap[nPos+1] = hData[SHIDX(2)];
		}
	}
	__syncthreads();
}

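/*
 * Host entry point: copies the task configuration and chain-start indices
 * to the device, runs RTGenMD5Kernel over the chain in slices of
 * KERN_CHAIN_SIZE links, and copies the resulting chain states back into
 * resultBuff.  Returns the number of processed chains, or -1 on a CUDA error.
 */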
254 extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, unsigned __int64 *resultBuff) {
255         cudaError_t cuErr;
256         char buff[PLAIN_MAX_SIZE];
257         unsigned __int64 *data;
258         unsigned char *stPlain;
259         uint3 *dimVec;
260         unsigned char *charSet;
261         int *cpPos;
262         unsigned __int64 uiVal64;
263         time_t tStart, tEnd;
264         if(task->hash != rcuda::RHASH_MD5)
265                 return 0;
266
267         memset(buff, 0, PLAIN_MAX_SIZE);
268         cudaMalloc((void**)&data, task->idxCount*2*sizeof(unsigned __int64));
269         cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE);
270         cudaMalloc((void**)&dimVec, task->dimVecSize*sizeof(uint3));
271         cudaMalloc((void**)&charSet, task->charSetSize);
272         cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));
273
274         cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(unsigned __int64), cudaMemcpyHostToDevice);
275         cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice);
276         cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice);
277         cudaMemcpy(dimVec, task->dimVec, task->dimVecSize*sizeof(uint3), cudaMemcpyHostToDevice);
278         cudaMemcpy(charSet, task->charSet, task->charSetSize, cudaMemcpyHostToDevice);
279         cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);
280
281         cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
282         cudaMemcpyToSymbol(plStart, &stPlain, sizeof(stPlain));
283         cudaMemcpyToSymbol(plDimVec, &dimVec, sizeof(dimVec));
284         cudaMemcpyToSymbol(plChrSet, &charSet, sizeof(charSet));
285         cudaMemcpyToSymbol(plCpPos, &cpPos, sizeof(cpPos));
286         cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize));
287         cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
288         cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
289         uiVal64 = (unsigned __int64)-1/task->plainSpaceTotal;
290         cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));
291
292         int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
293         int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
294         grSizeX = GRID_X_SIZE;
295         dim3 numBlocks(grSizeX, grSizeY);
296         cuErr = cudaSuccess;
297         tStart = time(NULL);
298         for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) {
299                 RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
300                 cuErr = cudaGetLastError();
301                 if(cuErr == cudaSuccess)
302                         cuErr = cudaThreadSynchronize();
303                 
304         }
305         tEnd = time(NULL);
306         fprintf(stderr, "Kernel run time: %i\n", (tEnd - tStart));
307
308         if(cuErr == cudaSuccess)
309                 cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(unsigned __int64), cudaMemcpyDeviceToHost);
310         else
311                 fprintf(stderr, "Error happened: %d (%s)\n", cuErr, cudaGetErrorString(cuErr)); 
312
313         cudaFree(cpPos);
314         cudaFree(charSet);
315         cudaFree(dimVec);
316         cudaFree(stPlain);
317         cudaFree(data);
318         return cuErr==cudaSuccess? task->idxCount : -1;
319 }
320
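/*
 * Rounds minSize up to a whole kernel launch (grid rows of GRID_X_SIZE
 * blocks, BLOCK_X_SIZE threads each), so callers can size the chain buffer
 * to exactly what CalcChainsOnCUDA will process.
 */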
321 extern "C" int GetChainsBufferSize(int minSize) {
322         int grSizeX = (minSize-1)/BLOCK_X_SIZE + 1;
323         int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
324         grSizeX = GRID_X_SIZE;
325         return grSizeX*grSizeY*BLOCK_X_SIZE;
326 }