1 //============================================================================
5 // Description : Generator of FreeRainbowTables / MD5, MD4, NTLM, SHA1, LM
6 //============================================================================
// Launch geometry: all extents are powers of two derived from the *_L2
// (log2) exponents GRID_X_L2, GRID_Y_L2 and BLOCK_X_L2 (defined outside
// this excerpt).
15 #define GRID_X_SIZE (1<<GRID_X_L2)
16 #define GRID_Y_SIZE (1<<GRID_Y_L2)
17 #define BLOCK_X_SIZE (1<<BLOCK_X_L2)
// Maximum plaintext length (bytes) supported by the generator buffers.
18 #define PLAIN_MAX_SIZE 20
// Number of chain links advanced per kernel launch; the host loop in
// CalcChainsOnCUDA() slices the full chain length into pieces of this size.
19 #define KERN_CHAIN_SIZE 100
// Upper bound on the reduction character set staged into shared memory.
20 #define CHAR_SET_MAXLEN 512
// Shared-memory slot index: logical word x of a thread's state lives at
// stride 16 (x<<4) from the thread's base slot.
// NOTE(review): combined with the ((i>>4)<<8)+(i&15) base computation this
// looks like a bank-conflict-avoiding interleave — confirm against the
// shared-memory layout docs before relying on that interpretation.
21 #define SHIDX(x) ((x)<<4)
// Device-global state. Every pointer/value below is written from the host
// via cudaMemcpyToSymbol() in CalcChainsOnCUDA() before any kernel launch.
24 __device__ uint64 *dataHeap;        // chain state: 2 uint64 per chain (index, packed checkpoint word)
25 __device__ unsigned char *plStart;  // start-plaintext buffer (PLAIN_MAX_SIZE bytes, zero-padded)
26 __device__ uint3 *plDimVec;         // per-position dimension vectors for plaintext generation
27 __device__ unsigned char *plChrSet; // reduction character set (up to CHAR_SET_MAXLEN bytes)
28 __device__ int *plCpPos;            // chain positions at which checkpoint bits are sampled
29 __device__ int plCpPosSize;         // number of entries in plCpPos
30 __device__ uint64 reduceOffset;     // table-specific offset added during index reduction
31 __device__ uint64 plainSpaceTotal;  // total size of the plaintext key space
32 __device__ uint64 rPlainSpaceTotal; // (2^64-1)/plainSpaceTotal: reciprocal for __umul64hi modulo
// RTGEN_PROLOGUE — shared opening of every RTGen*Kernel body (see the
// rcuda_*.inc files). It:
//   1. has thread 0 stage this block's chains from global dataHeap into
//      shared memory, splitting each 64-bit index into low/high 32-bit
//      halves so every chain occupies three 32-bit slots (index lo, index
//      hi, checkpoint word) addressed via SHIDX();
//   2. has thread 0 copy the character set, start plaintext and dimension
//      vectors into shared memory for the reduction step;
//   3. unpacks each thread's state: idx64 = 64-bit chain index,
//      checkpoint = low 16 bits of the packed word (accumulated checkpoint
//      bits), cpcheck = high 16 bits (next checkpoint position counter);
//   4. opens the per-link loop over [chainStart, chainStop) whose closing
//      brace is supplied by RTGEN_EPILOGUE.
// NOTE(review): this excerpt elides several original lines (the idx64
// declaration, the staging loop's closing braces, and — presumably — a
// __syncthreads() barrier between staging and use; confirm in the full
// source). Comments are kept outside the macro because line splicing runs
// before comment removal, making //-comments unsafe on \-continued lines.
34 #define RTGEN_PROLOGUE \
35 	unsigned int *hData; \
38 	unsigned int nPos, ii; \
39 	unsigned int cpcheck, checkpoint; \
41 	__shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; \
42 	__shared__ unsigned char cplChrSet[CHAR_SET_MAXLEN]; \
43 	__shared__ unsigned char cplStart[PLAIN_MAX_SIZE]; \
44 	__shared__ uint3 cplDimVec[PLAIN_MAX_SIZE]; \
46 	if(threadIdx.x == 0) { \
47 		nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1; \
48 		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
49 			hData = shData + ((ii>>4)<<8)+(ii&15); \
50 			hData[SHIDX(0)] = dataHeap[nPos]; \
51 			hData[SHIDX(1)] = dataHeap[nPos]>>32; \
52 			hData[SHIDX(2)] = dataHeap[nPos+1]; \
54 		memcpy(cplChrSet, plChrSet, CHAR_SET_MAXLEN); \
55 		memcpy(cplStart, plStart, PLAIN_MAX_SIZE); \
56 		memcpy(cplDimVec, plDimVec, PLAIN_MAX_SIZE*sizeof(uint3)); \
60 	hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15); \
62 	idx64 = hData[SHIDX(1)]; \
63 	idx64 = (idx64<<32) | hData[SHIDX(0)]; \
64 	cpcheck = hData[SHIDX(2)]; \
65 	checkpoint = cpcheck&0x0000ffff; \
66 	cpcheck = cpcheck>>16; \
68 	for(nPos = chainStart; nPos < chainStop; nPos++) {
// RTGEN_EPILOGUE — shared closing of every RTGen*Kernel body. Per link it:
//   1. re-reads the hashed index halves the kernel body left in shared
//      memory and rebuilds idx64;
//   2. reduces idx64 into the plaintext space: adds reduceOffset + link
//      position, then computes idx64 mod plainSpaceTotal without a 64-bit
//      divide — __umul64hi(idx64, rPlainSpaceTotal) approximates
//      idx64/plainSpaceTotal via the precomputed reciprocal, and the
//      conditional subtraction corrects the one-off the approximation can
//      leave (uiDiv64 is declared on a line elided from this excerpt);
//   3. when the link position matches the next checkpoint in plCpPos,
//      records bit 0 of the reduced index in the checkpoint mask;
//   4. stores the state back to shared memory with the counter/mask
//      repacked as (cpcheck<<16)|checkpoint, and — after the link loop —
//      has thread 0 write the block's chains back to the global dataHeap.
// NOTE(review): the loop/branch closing braces and any barrier before the
// thread-0 write-back fall on lines elided from this excerpt — verify the
// synchronization in the full source.
71 #define RTGEN_EPILOGUE \
72 	idx64 = hData[SHIDX(1)]; \
73 	idx64 = (idx64<<32) | hData[SHIDX(0)]; \
74 	idx64 += reduceOffset + nPos; \
75 	uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); \
76 	idx64 -= uiDiv64*plainSpaceTotal; \
77 	if(idx64 >= plainSpaceTotal) \
78 		idx64 -= plainSpaceTotal; \
80 	if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) { \
81 		checkpoint |= ((unsigned int)idx64&1) << cpcheck; \
86 	hData[SHIDX(0)] = idx64; \
87 	hData[SHIDX(1)] = idx64>>32; \
88 	hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff); \
91 	if(threadIdx.x == 0) { \
92 		nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1; \
93 		for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
94 			hData = shData + ((ii>>4)<<8)+(ii&15); \
95 			dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)]; \
96 			dataHeap[nPos+1] = hData[SHIDX(2)]; \
101 #include "rcuda_md5.inc"
102 #include "rcuda_md4.inc"
103 #include "rcuda_ntlm.inc"
104 #include "rcuda_sha1.inc"
105 #include "rcuda_lm.inc"
// Host entry point: advances rainbow chains on the GPU.
//
// Parameters:
//   task       - describes the table: hash algorithm, chain length, chain
//                count (idxCount), reduction parameters and checkpoints.
//   resultBuff - in/out buffer of 2 uint64 per chain (chain index + packed
//                checkpoint word); uploaded, advanced through
//                task->rainbowChainLen-1 links, then copied back in place.
// Returns task->idxCount on success, -1 on any CUDA error.
//
// NOTE(review): this excerpt elides interior lines (the switch headers,
// break statements, default cases, cudaFree cleanup and closing braces);
// the comments below describe only the visible statements.
107 extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff) {
109 	char buff[PLAIN_MAX_SIZE];
111 	unsigned char *stPlain;
113 	unsigned char *charSet;
	// Reject character sets larger than the fixed shared-memory staging buffer.
117 	if(task->charSetSize > CHAR_SET_MAXLEN)
	// Supported algorithms; other task->hash values are presumably rejected
	// on elided lines — confirm in the full source.
121 	case rcuda::RHASH_MD5:
122 	case rcuda::RHASH_MD4:
123 	case rcuda::RHASH_NTLM:
124 	case rcuda::RHASH_SHA1:
125 	case rcuda::RHASH_LM:
	// Zero staging buffer so the start plaintext is zero-padded on device.
131 	memset(buff, 0, PLAIN_MAX_SIZE);
	// Device allocations.
	// NOTE(review): cudaMalloc/cudaMemcpy return codes are not checked here;
	// a failure only surfaces later via cudaGetLastError after a launch.
132 	cudaMalloc((void**)&data, task->idxCount*2*sizeof(uint64));
133 	cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE);
134 	cudaMalloc((void**)&dimVec, max(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3));
135 	cudaMalloc((void**)&charSet, CHAR_SET_MAXLEN);
136 	cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));
	// Upload chain state and reduction tables. stPlain is written twice on
	// purpose: first fully zeroed, then overwritten with the (possibly
	// shorter) actual start plaintext, leaving the tail zero-padded.
138 	cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64), cudaMemcpyHostToDevice);
139 	cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice);
140 	cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice);
141 	cudaMemcpy(dimVec, task->dimVec, min(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3), cudaMemcpyHostToDevice);
142 	cudaMemcpy(charSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN), cudaMemcpyHostToDevice);
143 	cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);
	// Publish buffer addresses and scalar parameters to the __device__
	// symbols the kernels read (declared at the top of this file).
145 	cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
146 	cudaMemcpyToSymbol(plStart, &stPlain, sizeof(stPlain));
147 	cudaMemcpyToSymbol(plDimVec, &dimVec, sizeof(dimVec));
148 	cudaMemcpyToSymbol(plChrSet, &charSet, sizeof(charSet));
149 	cudaMemcpyToSymbol(plCpPos, &cpPos, sizeof(cpPos));
150 	cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize));
151 	cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
152 	cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
	// Reciprocal (2^64-1)/plainSpaceTotal for the __umul64hi-based modulo
	// in RTGEN_EPILOGUE — avoids a 64-bit divide per link on the device.
153 	uiVal64 = (uint64)-1/task->plainSpaceTotal;
154 	cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));
	// Launch geometry: enough blocks to cover idxCount threads. grSizeX is
	// computed only to derive grSizeY (the Y dimension absorbs the overflow),
	// then pinned to the fixed GRID_X_SIZE; GetChainsBufferSize() guarantees
	// the buffer matches this rounded-up capacity.
156 	int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
157 	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
158 	grSizeX = GRID_X_SIZE;
159 	dim3 numBlocks(grSizeX, grSizeY);
	// Block (rather than spin) the host thread while waiting on the GPU.
	// NOTE(review): cudaSetDeviceFlags only takes effect before the context
	// is created; the cudaMalloc calls above already created it, so this
	// call likely fails silently — confirm and move it earlier if so.
162 	cudaSetDeviceFlags(cudaDeviceBlockingSync);
	// Advance all chains through rainbowChainLen-1 links, KERN_CHAIN_SIZE
	// links per launch, stopping early on the first CUDA error.
164 	for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) {
166 		case rcuda::RHASH_MD5:
167 			RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
169 		case rcuda::RHASH_MD4:
170 			RTGenMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
172 		case rcuda::RHASH_NTLM:
173 			RTGenNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
175 		case rcuda::RHASH_SHA1:
176 			RTGenSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
178 		case rcuda::RHASH_LM:
179 			RTGenLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
		// Launch-config errors surface via cudaGetLastError; in-kernel
		// faults surface at the synchronize.
		// NOTE(review): cudaThreadSynchronize is deprecated — modern CUDA
		// spells this cudaDeviceSynchronize.
182 		cuErr = cudaGetLastError();
183 		if(cuErr == cudaSuccess)
184 			cuErr = cudaThreadSynchronize();
	// On success, copy the advanced chain state back over the input buffer.
187 	if(cuErr == cudaSuccess)
188 		cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64), cudaMemcpyDeviceToHost);
	// Error path (the matching else falls on an elided line).
190 		fprintf(stderr, "Error happened: %d (%s)\n", cuErr, cudaGetErrorString(cuErr));
197 	return cuErr==cudaSuccess? task->idxCount : -1;
// Returns the chain-buffer capacity (number of chains) the caller must
// allocate for a requested minimum: minSize rounded up to a whole
// GRID_X_SIZE x grSizeY x BLOCK_X_SIZE launch, mirroring the geometry
// computed in CalcChainsOnCUDA() so every launched thread has a chain slot.
200 extern "C" int GetChainsBufferSize(int minSize) {
201 	int grSizeX = (minSize-1)/BLOCK_X_SIZE + 1;
202 	int grSizeY = (grSizeX-1)/GRID_X_SIZE + 1;
	// X is pinned to the full grid width; Y absorbed the remainder above.
203 	grSizeX = GRID_X_SIZE;
204 	return grSizeX*grSizeY*BLOCK_X_SIZE;
// Selects the CUDA device for this host thread. Returns the cudaError_t
// from cudaSetDevice as an int (0 == cudaSuccess).
207 extern "C" int SetCudaDevice(int device) {
208 	return cudaSetDevice(device);