X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=BOINC%20software%2FBOINC%20client%20apps%2Fdistrrtgen_cuda%2Frcuda.cu;h=b626041ede05721baf29f545226e9edcb5c9a524;hb=683eeaa26cde6b9faa7600497da5896319583692;hp=e626bd4148792d9129ce86c5715aa73d6430820a;hpb=38fa2bdbfac7674740338b5afecd1e8e48353ec4;p=freerainbowtables diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu index e626bd4..b626041 100644 --- a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu @@ -1,281 +1,145 @@ +//============================================================================ +// Name : rcuda.cu +// Author : Jan Kyska +// Version : 1.00 +// Description : Generator of FreeRainbowTables / MD5, MD4, NTLM, SHA1, LM +//============================================================================ + #include #include #include "rcuda.h" #define GRID_X_L2 6 #define GRID_Y_L2 6 -#define BLOCK_X_L2 7 +#define BLOCK_X_L2 6 #define GRID_X_SIZE (1<>(32-r[i])); - ret += b; - return ret; -} - -__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { - unsigned int ret; - ret = a + ((b&d)|(c&(~d))) + ac[i] + data[SHIDX(g[i])]; - ret = (ret<>(32-r[i])); - ret += b; - return ret; -} - -__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { - unsigned int ret; - ret = a + (b^c^d) + ac[i] + data[SHIDX(g[i])]; - ret = (ret<>(32-r[i])); - ret += b; - return ret; -} - -__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { - unsigned int ret; - ret = a + (c^(b|(~d))) + ac[i] + data[SHIDX(g[i])]; - ret = (ret<>(32-r[i])); - ret += b; - return ret; -} - -__device__ void MD5(unsigned int* dataHash) { - unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x; - int ii; - - // Round 1 - for(ii = 0; ii < 16; ii++) { - x = b; - b = FF(a, b, c, d, ii, dataHash); - a = d; d = c; c = x; - } - - // Round 2 - for(; ii < 32; ii++) { - x = b; - b = GG(a, b, c, d, ii, dataHash); - a = d; d = c; c = x; - } - - // Round 3 - for(; ii < 48; ii++) { - x = b; - b = HH(a, b, c, d, ii, dataHash); - a = d; d = c; c = x; - } - - // Round 4 - for(; ii < 64; ii++) { - x = b; - b = II(a, b, c, d, ii, dataHash); - a = d; d = c; c = x; - } - - dataHash[SHIDX(0)] = a + h[0]; - dataHash[SHIDX(1)] = b + h[1]; - dataHash[SHIDX(2)] = c + h[2]; - dataHash[SHIDX(3)] = d + h[3]; -} - - -#define PLAIN_MAX_SIZE 20 - -__device__ unsigned __int64 *dataHeap; +__device__ uint64 *dataHeap; __device__ unsigned char *plStart; __device__ uint3 *plDimVec; __device__ unsigned char *plChrSet; __device__ int *plCpPos; __device__ int plCpPosSize; -__device__ unsigned __int64 reduceOffset; -__device__ unsigned __int64 plainSpaceTotal; -__device__ unsigned __int64 rPlainSpaceTotal; - - -__global__ void RTGenMD5Kernel(unsigned int chainStart, unsigned int chainStop) { - unsigned int *hData; - uint3 dimItem; - unsigned int uiVal, uiMul, uiDiv, idx; - unsigned __int64 uiVal64, uiMul64, uiDiv64, idx64; - unsigned int nPos, size, ii, jj, kk; - unsigned int cpcheck, checkpoint; - unsigned int plain; - - __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; - - if(threadIdx.x == 0) { - nPos = ((((blockIdx.y<>4)<<8)+(ii&15); - hData[SHIDX(0)] = dataHeap[nPos]; - hData[SHIDX(1)] = dataHeap[nPos]>>32; - hData[SHIDX(2)] = dataHeap[nPos+1]; - } - } - __syncthreads(); - - hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15); - - idx64 = hData[SHIDX(1)]; - idx64 = (idx64<<32) | hData[SHIDX(0)]; - cpcheck = hData[SHIDX(2)]; - checkpoint = cpcheck&0x0000ffff; - cpcheck = cpcheck>>16; - +__device__ uint64 reduceOffset; +__device__ uint64 plainSpaceTotal; +__device__ uint64 rPlainSpaceTotal; + +#define RTGEN_PROLOGUE \ + unsigned int *hData; \ + unsigned int idx; \ + uint64 idx64; \ + unsigned int nPos, ii; \ + unsigned int cpcheck, checkpoint; \ + \ + __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; \ + __shared__ unsigned char cplChrSet[CHAR_SET_MAXLEN]; \ + __shared__ unsigned char cplStart[PLAIN_MAX_SIZE]; \ + __shared__ uint3 cplDimVec[PLAIN_MAX_SIZE]; \ + \ + if(threadIdx.x == 0) { \ + nPos = ((((blockIdx.y<>4)<<8)+(ii&15); \ + hData[SHIDX(0)] = dataHeap[nPos]; \ + hData[SHIDX(1)] = dataHeap[nPos]>>32; \ + hData[SHIDX(2)] = dataHeap[nPos+1]; \ + } \ + memcpy(cplChrSet, plChrSet, CHAR_SET_MAXLEN); \ + memcpy(cplStart, plStart, PLAIN_MAX_SIZE); \ + memcpy(cplDimVec, plDimVec, PLAIN_MAX_SIZE*sizeof(uint3)); \ + } \ + __syncthreads(); \ + \ + hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15); \ + \ + idx64 = hData[SHIDX(1)]; \ + idx64 = (idx64<<32) | hData[SHIDX(0)]; \ + cpcheck = hData[SHIDX(2)]; \ + checkpoint = cpcheck&0x0000ffff; \ + cpcheck = cpcheck>>16; \ + \ for(nPos = chainStart; nPos < chainStop; nPos++) { - // transform to the plain text - plain = 0x80; - jj = (PLAIN_MAX_SIZE>>2)+1; - for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) { - uiVal64 = idx64 + plStart[ii]; - uiVal64--; - dimItem = plDimVec[ii]; - uiMul64 = (unsigned __int64)dimItem.y<<32; - idx64 = __umul64hi(uiVal64, uiMul64); - uiDiv64 = uiVal64 - idx64*(unsigned __int64)dimItem.x; - uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y); - uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x; - idx64 += uiVal; - if(uiDiv >= dimItem.x) { - uiDiv -= dimItem.x; - idx64++; - } - plain = (plain<<8) | plChrSet[dimItem.z + uiDiv]; - if((ii&3) == 2) { - hData[SHIDX(jj--)] = plain; - plain = 0; - } - } - for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) { - uiVal = idx + plStart[ii]; - uiVal--; - dimItem = plDimVec[ii]; - idx = __umulhi(uiVal, dimItem.y); - uiDiv = uiVal - idx*dimItem.x; - if(uiDiv >= dimItem.x) { - uiDiv -= dimItem.x; - idx++; - } - plain = (plain<<8) | plChrSet[dimItem.z + uiDiv]; - if((ii&3) == 2) { - hData[SHIDX(jj--)] = plain; - plain = 0; - } - } - - // prepare for MD5 - size = ii; - ii = ((((3-(ii&3))<<3)-1)&0x1f)+1; - plain = plain<>2)+1; plain = hData[SHIDX(jj++)], kk++) - hData[SHIDX(kk)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii)); - hData[SHIDX(kk)] = plain>>ii; - for(kk++; kk < 14; kk++) - hData[SHIDX(kk)] = 0; - hData[SHIDX(kk++)] = size<<3; - hData[SHIDX(kk)] = 0; - // hash - MD5(hData); - idx64 = hData[SHIDX(1)]; - idx64 = (idx64<<32) | hData[SHIDX(0)]; - idx64 += reduceOffset + nPos; - uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); - idx64 -= uiDiv64*plainSpaceTotal; - if(idx64 >= plainSpaceTotal) - idx64 -= plainSpaceTotal; - - if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) { - checkpoint |= ((unsigned int)idx64&1) << cpcheck; - cpcheck++; - } +#define RTGEN_EPILOGUE \ + idx64 = hData[SHIDX(1)]; \ + idx64 = (idx64<<32) | hData[SHIDX(0)]; \ + idx64 += reduceOffset + nPos; \ + uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); \ + idx64 -= uiDiv64*plainSpaceTotal; \ + if(idx64 >= plainSpaceTotal) \ + idx64 -= plainSpaceTotal; \ + \ + if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) { \ + checkpoint |= ((unsigned int)idx64&1) << cpcheck; \ + cpcheck++; \ + } \ + } \ + \ + hData[SHIDX(0)] = idx64; \ + hData[SHIDX(1)] = idx64>>32; \ + hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff); \ + __syncthreads(); \ + \ + if(threadIdx.x == 0) { \ + nPos = ((((blockIdx.y<>4)<<8)+(ii&15); \ + dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)]; \ + dataHeap[nPos+1] = hData[SHIDX(2)]; \ + } \ } - hData[SHIDX(0)] = idx64; - hData[SHIDX(1)] = idx64>>32; - hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff); - __syncthreads(); - - if(threadIdx.x == 0) { - nPos = ((((blockIdx.y<>4)<<8)+(ii&15); - dataHeap[nPos] = ((unsigned __int64)hData[SHIDX(1)]<<32)|(unsigned __int64)hData[SHIDX(0)]; - dataHeap[nPos+1] = hData[SHIDX(2)]; - } - } - __syncthreads(); -} +#include "rcuda_md5.inc" +#include "rcuda_md4.inc" +#include "rcuda_ntlm.inc" +#include "rcuda_sha1.inc" +#include "rcuda_lm.inc" -extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, unsigned __int64 *resultBuff) { +extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff) { cudaError_t cuErr; char buff[PLAIN_MAX_SIZE]; - unsigned __int64 *data; + uint64 *data; unsigned char *stPlain; uint3 *dimVec; unsigned char *charSet; int *cpPos; - unsigned __int64 uiVal64; - time_t tStart, tEnd; - if(task->hash != rcuda::RHASH_MD5) + uint64 uiVal64; + + if(task->charSetSize > CHAR_SET_MAXLEN) + return -1; + + switch(task->hash) { + case rcuda::RHASH_MD5: + case rcuda::RHASH_MD4: + case rcuda::RHASH_NTLM: + case rcuda::RHASH_SHA1: + case rcuda::RHASH_LM: + break; + default: return 0; - + } + memset(buff, 0, PLAIN_MAX_SIZE); - cudaMalloc((void**)&data, task->idxCount*2*sizeof(unsigned __int64)); + cudaMalloc((void**)&data, task->idxCount*2*sizeof(uint64)); cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE); - cudaMalloc((void**)&dimVec, task->dimVecSize*sizeof(uint3)); - cudaMalloc((void**)&charSet, task->charSetSize); + cudaMalloc((void**)&dimVec, max(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3)); + cudaMalloc((void**)&charSet, CHAR_SET_MAXLEN); cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int)); - cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(unsigned __int64), cudaMemcpyHostToDevice); + cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64), cudaMemcpyHostToDevice); cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice); cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice); - cudaMemcpy(dimVec, task->dimVec, task->dimVecSize*sizeof(uint3), cudaMemcpyHostToDevice); - cudaMemcpy(charSet, task->charSet, task->charSetSize, cudaMemcpyHostToDevice); + cudaMemcpy(dimVec, task->dimVec, min(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3), cudaMemcpyHostToDevice); + cudaMemcpy(charSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN), cudaMemcpyHostToDevice); cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpyToSymbol(dataHeap, &data, sizeof(data)); @@ -286,7 +150,7 @@ extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, unsigned __int64 * cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize)); cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset)); cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal)); - uiVal64 = (unsigned __int64)-1/task->plainSpaceTotal; + uiVal64 = (uint64)-1/task->plainSpaceTotal; cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64)); int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1; @@ -294,19 +158,33 @@ extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, unsigned __int64 * grSizeX = GRID_X_SIZE; dim3 numBlocks(grSizeX, grSizeY); cuErr = cudaSuccess; - tStart = time(NULL); + + cudaSetDeviceFlags(cudaDeviceBlockingSync); for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) { - RTGenMD5Kernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + switch(task->hash) { + case rcuda::RHASH_MD5: + RTGenMD5Kernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + case rcuda::RHASH_MD4: + RTGenMD4Kernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + case rcuda::RHASH_NTLM: + RTGenNTLMKernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + case rcuda::RHASH_SHA1: + RTGenSHA1Kernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + case rcuda::RHASH_LM: + RTGenLMKernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + } cuErr = cudaGetLastError(); if(cuErr == cudaSuccess) cuErr = cudaThreadSynchronize(); - } - tEnd = time(NULL); - fprintf(stderr, "Kernel run time: %i\n", (tEnd - tStart)); if(cuErr == cudaSuccess) - cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(unsigned __int64), cudaMemcpyDeviceToHost); + cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64), cudaMemcpyDeviceToHost); else fprintf(stderr, "Error happened: %d (%s)\n", cuErr, cudaGetErrorString(cuErr)); @@ -324,3 +202,7 @@ extern "C" int GetChainsBufferSize(int minSize) { grSizeX = GRID_X_SIZE; return grSizeX*grSizeY*BLOCK_X_SIZE; } + +extern "C" int SetCudaDevice(int device) { + return cudaSetDevice(device); +}