From 683eeaa26cde6b9faa7600497da5896319583692 Mon Sep 17 00:00:00 2001 From: James Nobis Date: Tue, 9 Nov 2010 22:21:22 -0600 Subject: [PATCH] merge all the algorithms for distrrtgen_cuda clean up and fix for *nix and 64/32 compat TBD boinc_temporary_exit() isn't functional on linux http://bolt.berkeley.edu/trac/changeset/22382 --- .../distrrtgen_cuda/Makefile | 2 +- .../distrrtgen_cuda/distrrtgen.cpp | 73 +-- .../distrrtgen_cuda/rcuda.cu | 352 +++++-------- .../BOINC client apps/distrrtgen_cuda/rcuda.h | 3 +- .../distrrtgen_cuda/rcuda_ext.cpp | 5 +- .../distrrtgen_cuda/rcuda_lm.inc | 479 ++++++++++++++++++ .../distrrtgen_cuda/rcuda_md4.inc | 143 ++++++ .../distrrtgen_cuda/rcuda_md5.inc | 183 +++++++ .../distrrtgen_cuda/rcuda_ntlm.inc | 145 ++++++ .../distrrtgen_cuda/rcuda_sha1.inc | 160 ++++++ 10 files changed, 1271 insertions(+), 274 deletions(-) create mode 100644 BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_lm.inc create mode 100644 BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md4.inc create mode 100644 BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md5.inc create mode 100644 BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ntlm.inc create mode 100644 BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_sha1.inc diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/Makefile b/BOINC software/BOINC client apps/distrrtgen_cuda/Makefile index dbf0bc4..86eb1fa 100644 --- a/BOINC software/BOINC client apps/distrrtgen_cuda/Makefile +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/Makefile @@ -103,5 +103,5 @@ rcuda_ext.o: rcuda_ext.h rcuda_ext.cpp $(COMMON_API_PATH)/ChainWalkContext.h $(CXX) $(CXXFLAGS) rcuda_ext.cpp rcuda.o: rcuda.h rcuda.cu $(COMMON_API_PATH)/Public.h $(COMMON_API_PATH)/global.h - /usr/local/cuda/bin/nvcc $(DEBUG) -I../../../Common/rt\ api --compile rcuda.cu + /usr/local/cuda/bin/nvcc $(OPTIMIZATION) $(DEBUG) -I../../../Common/rt\ api --compile rcuda.cu # /usr/local/cuda/bin/nvcc -G $(DEBUG) --machine 32 -I../../../Common/rt\ api --compile rcuda.cu diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/distrrtgen.cpp b/BOINC software/BOINC client apps/distrrtgen_cuda/distrrtgen.cpp index 0d7145d..9ed924e 100644 --- a/BOINC software/BOINC client apps/distrrtgen_cuda/distrrtgen.cpp +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/distrrtgen.cpp @@ -72,6 +72,7 @@ bool early_crash = false; bool early_sleep = false; double cpu_time = 20, comp_result; */ +/* int QuickSortPartition(RainbowChainCP* pChain, int nLow, int nHigh) { int nRandomIndex = nLow + ((uint32)rand() * ((uint32)RAND_MAX + 1) + (uint32)rand()) % (nHigh - nLow + 1); @@ -104,7 +105,7 @@ void QuickSort(RainbowChainCP* pChain, int nLow, int nHigh) QuickSort(pChain, nPivotLoc + 1, nHigh); } } - +*/ int main(int argc, char **argv) { int retval; double fd; @@ -115,16 +116,29 @@ int main(int argc, char **argv) { fprintf(stderr, "boinc_init returned %d\n", retval); exit(retval); } - - - // get size of input file (used to compute fraction done) - // - //file_size(input_path, fsize); - - // See if there's a valid checkpoint file. - // If so seek input file and truncate output file - // + // extract a --device option + std::vector argVec; + int cudaDevice = -1; + for(int ii = 0; ii < argc; ii++) { + if(cudaDevice < 0 && strcmp(argv[ii], "--device") == 0 && ii + 1 < argc) + cudaDevice = atoi(argv[++ii]); + else + argVec.push_back(argv[ii]); + } + argc = (int)argVec.size(); + argv = &argVec[0]; + if(!(cudaDevice < 0)) + // set the cuda device + if(rcuda::SetCudaDevice(cudaDevice) != 0) + { + //XXX this call doesn't work on linux + // fixed in upstream source 2010-09-16 + // http://bolt.berkeley.edu/trac/changeset/22382 + #ifdef _WIN32 + boinc_temporary_exit(60); + #endif + } if(argc < 10) { @@ -210,9 +224,7 @@ int main(int argc, char **argv) { // Open file -// fclose(fopen(sFilename.c_str(), "a")); -// FILE* file = fopen(sFilename.c_str(), "r+b"); - boinc_resolve_filename("result", output_path, sizeof(output_path)); + boinc_resolve_filename("result", output_path, sizeof(output_path)); fclose(boinc_fopen(output_path, "a")); FILE *outfile = boinc_fopen(output_path, "r+b"); @@ -228,8 +240,8 @@ int main(int argc, char **argv) { unsigned int nFileLen; // Round to boundary - nDataLen = nDataLen / 18 * 18; - if (nDataLen == nRainbowChainCount * 18) + nDataLen = nDataLen / 10 * 10; + if (nDataLen == nRainbowChainCount * 10) { std::cerr << "precomputation of this rainbow table already finished" << std::endl; fclose(outfile); @@ -237,12 +249,14 @@ int main(int argc, char **argv) { } fseek(outfile, nDataLen, SEEK_SET); + //XXX size_t isn't 32/64 clean size_t nReturn; CChainWalkContext cwc; uint64 nIndex[2]; + time_t tStart = time(NULL); // std::cout << "Starting to generate chains" << std::endl; - int maxCalcBuffSize = rcuda::GetChainsBufferSize(5000); + int maxCalcBuffSize = rcuda::GetChainsBufferSize(0x2000); uint64 *calcBuff = new uint64[2*maxCalcBuffSize]; int ii; @@ -250,10 +264,8 @@ int main(int argc, char **argv) { rcuda::RCudaTask cuTask; std::vector stPlain; ex.Init(); -time_t tStart, tStartFinal, tEndFinal; -time_t tEnd; - tStartFinal = time(NULL); - for(uint32 nCurrentCalculatedChains = nDataLen / 18, calcSize; nCurrentCalculatedChains < nRainbowChainCount; ) + + for(int nCurrentCalculatedChains = nDataLen / 10, calcSize; nCurrentCalculatedChains < nRainbowChainCount; ) { fd = (double)nCurrentCalculatedChains / (double)nRainbowChainCount; boinc_fraction_done(fd); @@ -276,21 +288,16 @@ time_t tEnd; calcBuff[2*ii] = cuTask.startIdx + ii; calcBuff[2*ii+1] = 0; } - - tStart = time(NULL); - calcSize = rcuda::CalcChainsOnCUDA(&cuTask, calcBuff); - tEnd = time(NULL); - std::cerr << "CUDA time taken: " << tEnd - tStart << std::endl; - tStart = time(NULL); + if(calcSize > 0) { nCurrentCalculatedChains += calcSize; for(ii = 0; ii < cuTask.idxCount; ii++) { nIndex[0] = cuTask.startIdx + ii; - nReturn = fwrite(nIndex, 1, 8, outfile); - nReturn += fwrite(calcBuff+(2*ii), 1, 8, outfile); +// nReturn = fwrite(nIndex, 1, 8, outfile); + nReturn = fwrite(calcBuff+(2*ii), 1, 8, outfile); nReturn += fwrite(calcBuff+(2*ii+1), 1, 2, outfile); - if(nReturn != 18) { + if(nReturn != 10) { std::cerr << "disk write fail" << std::endl; fclose(outfile); return 9; @@ -302,12 +309,11 @@ time_t tEnd; return 0; } } - tEndFinal = time(NULL); - std::cerr << "Time taken: " << tEndFinal - tStartFinal << " secs" << std::endl; delete [] calcBuff; #ifdef _DEBUG std::cout << "Generation completed" << std::endl; #endif +/* fseek(outfile, 0, SEEK_SET); nFileLen = GetFileLen(outfile); nRainbowChainCount = nFileLen / 18; @@ -347,7 +353,7 @@ time_t tEnd; } delete[] pChain; } - +*/ fclose(outfile); // main loop - read characters, convert to UC, write @@ -368,6 +374,3 @@ int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode return main(argc, argv); } #endif - -const char *BOINC_RCSID_33ac47a071 = "$Id: upper_case.C 12135 2007-02-21 20:04:14Z davea $"; - diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu index 187b105..b626041 100644 --- a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu @@ -1,128 +1,26 @@ +//============================================================================ +// Name : rcuda.cu +// Author : Jan Kyska +// Version : 1.00 +// Description : Generator of FreeRainbowTables / MD5, MD4, NTLM, SHA1, LM +//============================================================================ + #include #include #include "rcuda.h" #define GRID_X_L2 6 #define GRID_Y_L2 6 -#define BLOCK_X_L2 7 +#define BLOCK_X_L2 6 #define GRID_X_SIZE (1<>(32-r[i])); - ret += b; - return ret; -} - -__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { - unsigned int ret; - ret = a + ((b&d)|(c&(~d))) + ac[i] + data[SHIDX(g[i])]; - ret = (ret<>(32-r[i])); - ret += b; - return ret; -} - -__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { - unsigned int ret; - ret = a + (b^c^d) + ac[i] + data[SHIDX(g[i])]; - ret = (ret<>(32-r[i])); - ret += b; - return ret; -} - -__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { - unsigned int ret; - ret = a + (c^(b|(~d))) + ac[i] + data[SHIDX(g[i])]; - ret = (ret<>(32-r[i])); - ret += b; - return ret; -} - -__device__ void MD5(unsigned int* dataHash) { - unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x; - int ii; - - // Round 1 - for(ii = 0; ii < 16; ii++) { - x = b; - b = FF(a, b, c, d, ii, dataHash); - a = d; d = c; c = x; - } - - // Round 2 - for(; ii < 32; ii++) { - x = b; - b = GG(a, b, c, d, ii, dataHash); - a = d; d = c; c = x; - } - - // Round 3 - for(; ii < 48; ii++) { - x = b; - b = HH(a, b, c, d, ii, dataHash); - a = d; d = c; c = x; - } - - // Round 4 - for(; ii < 64; ii++) { - x = b; - b = II(a, b, c, d, ii, dataHash); - a = d; d = c; c = x; - } - - dataHash[SHIDX(0)] = a + h[0]; - dataHash[SHIDX(1)] = b + h[1]; - dataHash[SHIDX(2)] = c + h[2]; - dataHash[SHIDX(3)] = d + h[3]; -} - - -#define PLAIN_MAX_SIZE 20 - __device__ uint64 *dataHeap; __device__ unsigned char *plStart; __device__ uint3 *plDimVec; @@ -133,123 +31,78 @@ __device__ uint64 reduceOffset; __device__ uint64 plainSpaceTotal; __device__ uint64 rPlainSpaceTotal; - -__global__ void RTGenMD5Kernel(unsigned int chainStart, unsigned int chainStop) { - unsigned int *hData; - uint3 dimItem; - unsigned int uiVal, uiMul, uiDiv, idx; - uint64 uiVal64, uiMul64, uiDiv64, idx64; - unsigned int nPos, size, ii, jj, kk; - unsigned int cpcheck, checkpoint; - unsigned int plain; - - __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; - - if(threadIdx.x == 0) { - nPos = ((((blockIdx.y<>4)<<8)+(ii&15); - hData[SHIDX(0)] = dataHeap[nPos]; - hData[SHIDX(1)] = dataHeap[nPos]>>32; - hData[SHIDX(2)] = dataHeap[nPos+1]; - } - } - __syncthreads(); - - hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15); - - idx64 = hData[SHIDX(1)]; - idx64 = (idx64<<32) | hData[SHIDX(0)]; - cpcheck = hData[SHIDX(2)]; - checkpoint = cpcheck&0x0000ffff; - cpcheck = cpcheck>>16; - +#define RTGEN_PROLOGUE \ + unsigned int *hData; \ + unsigned int idx; \ + uint64 idx64; \ + unsigned int nPos, ii; \ + unsigned int cpcheck, checkpoint; \ + \ + __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; \ + __shared__ unsigned char cplChrSet[CHAR_SET_MAXLEN]; \ + __shared__ unsigned char cplStart[PLAIN_MAX_SIZE]; \ + __shared__ uint3 cplDimVec[PLAIN_MAX_SIZE]; \ + \ + if(threadIdx.x == 0) { \ + nPos = ((((blockIdx.y<>4)<<8)+(ii&15); \ + hData[SHIDX(0)] = dataHeap[nPos]; \ + hData[SHIDX(1)] = dataHeap[nPos]>>32; \ + hData[SHIDX(2)] = dataHeap[nPos+1]; \ + } \ + memcpy(cplChrSet, plChrSet, CHAR_SET_MAXLEN); \ + memcpy(cplStart, plStart, PLAIN_MAX_SIZE); \ + memcpy(cplDimVec, plDimVec, PLAIN_MAX_SIZE*sizeof(uint3)); \ + } \ + __syncthreads(); \ + \ + hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15); \ + \ + idx64 = hData[SHIDX(1)]; \ + idx64 = (idx64<<32) | hData[SHIDX(0)]; \ + cpcheck = hData[SHIDX(2)]; \ + checkpoint = cpcheck&0x0000ffff; \ + cpcheck = cpcheck>>16; \ + \ for(nPos = chainStart; nPos < chainStop; nPos++) { - // transform to the plain text - plain = 0x80; - jj = (PLAIN_MAX_SIZE>>2)+1; - for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) { - uiVal64 = idx64 + plStart[ii]; - uiVal64--; - dimItem = plDimVec[ii]; - uiMul64 = (uint64)dimItem.y<<32; - idx64 = __umul64hi(uiVal64, uiMul64); - uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x; - uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y); - uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x; - idx64 += uiVal; - if(uiDiv >= dimItem.x) { - uiDiv -= dimItem.x; - idx64++; - } - plain = (plain<<8) | plChrSet[dimItem.z + uiDiv]; - if((ii&3) == 2) { - hData[SHIDX(jj--)] = plain; - plain = 0; - } - } - for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) { - uiVal = idx + plStart[ii]; - uiVal--; - dimItem = plDimVec[ii]; - idx = __umulhi(uiVal, dimItem.y); - uiDiv = uiVal - idx*dimItem.x; - if(uiDiv >= dimItem.x) { - uiDiv -= dimItem.x; - idx++; - } - plain = (plain<<8) | plChrSet[dimItem.z + uiDiv]; - if((ii&3) == 2) { - hData[SHIDX(jj--)] = plain; - plain = 0; - } - } - - // prepare for MD5 - size = ii; - ii = ((((3-(ii&3))<<3)-1)&0x1f)+1; - plain = plain<>2)+1; plain = hData[SHIDX(jj++)], kk++) - hData[SHIDX(kk)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii)); - hData[SHIDX(kk)] = plain>>ii; - for(kk++; kk < 14; kk++) - hData[SHIDX(kk)] = 0; - hData[SHIDX(kk++)] = size<<3; - hData[SHIDX(kk)] = 0; - // hash - MD5(hData); - idx64 = hData[SHIDX(1)]; - idx64 = (idx64<<32) | hData[SHIDX(0)]; - idx64 += reduceOffset + nPos; - uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); - idx64 -= uiDiv64*plainSpaceTotal; - if(idx64 >= plainSpaceTotal) - idx64 -= plainSpaceTotal; - - if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) { - checkpoint |= ((unsigned int)idx64&1) << cpcheck; - cpcheck++; - } +#define RTGEN_EPILOGUE \ + idx64 = hData[SHIDX(1)]; \ + idx64 = (idx64<<32) | hData[SHIDX(0)]; \ + idx64 += reduceOffset + nPos; \ + uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); \ + idx64 -= uiDiv64*plainSpaceTotal; \ + if(idx64 >= plainSpaceTotal) \ + idx64 -= plainSpaceTotal; \ + \ + if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) { \ + checkpoint |= ((unsigned int)idx64&1) << cpcheck; \ + cpcheck++; \ + } \ + } \ + \ + hData[SHIDX(0)] = idx64; \ + hData[SHIDX(1)] = idx64>>32; \ + hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff); \ + __syncthreads(); \ + \ + if(threadIdx.x == 0) { \ + nPos = ((((blockIdx.y<>4)<<8)+(ii&15); \ + dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)]; \ + dataHeap[nPos+1] = hData[SHIDX(2)]; \ + } \ } - hData[SHIDX(0)] = idx64; - hData[SHIDX(1)] = idx64>>32; - hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff); - __syncthreads(); - - if(threadIdx.x == 0) { - nPos = ((((blockIdx.y<>4)<<8)+(ii&15); - dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)]; - dataHeap[nPos+1] = hData[SHIDX(2)]; - } - } - __syncthreads(); -} +#include "rcuda_md5.inc" +#include "rcuda_md4.inc" +#include "rcuda_ntlm.inc" +#include "rcuda_sha1.inc" +#include "rcuda_lm.inc" extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff) { cudaError_t cuErr; @@ -260,22 +113,33 @@ extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff unsigned char *charSet; int *cpPos; uint64 uiVal64; - time_t tStart, tEnd; - if(task->hash != rcuda::RHASH_MD5) - return 0; + if(task->charSetSize > CHAR_SET_MAXLEN) + return -1; + + switch(task->hash) { + case rcuda::RHASH_MD5: + case rcuda::RHASH_MD4: + case rcuda::RHASH_NTLM: + case rcuda::RHASH_SHA1: + case rcuda::RHASH_LM: + break; + default: + return 0; + } + memset(buff, 0, PLAIN_MAX_SIZE); cudaMalloc((void**)&data, task->idxCount*2*sizeof(uint64)); cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE); - cudaMalloc((void**)&dimVec, task->dimVecSize*sizeof(uint3)); - cudaMalloc((void**)&charSet, task->charSetSize); + cudaMalloc((void**)&dimVec, max(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3)); + cudaMalloc((void**)&charSet, CHAR_SET_MAXLEN); cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int)); cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64), cudaMemcpyHostToDevice); cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice); cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice); - cudaMemcpy(dimVec, task->dimVec, task->dimVecSize*sizeof(uint3), cudaMemcpyHostToDevice); - cudaMemcpy(charSet, task->charSet, task->charSetSize, cudaMemcpyHostToDevice); + cudaMemcpy(dimVec, task->dimVec, min(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3), cudaMemcpyHostToDevice); + cudaMemcpy(charSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN), cudaMemcpyHostToDevice); cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpyToSymbol(dataHeap, &data, sizeof(data)); @@ -294,16 +158,30 @@ extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff grSizeX = GRID_X_SIZE; dim3 numBlocks(grSizeX, grSizeY); cuErr = cudaSuccess; - tStart = time(NULL); + + cudaSetDeviceFlags(cudaDeviceBlockingSync); for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) { - RTGenMD5Kernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + switch(task->hash) { + case rcuda::RHASH_MD5: + RTGenMD5Kernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + case rcuda::RHASH_MD4: + RTGenMD4Kernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + case rcuda::RHASH_NTLM: + RTGenNTLMKernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + case rcuda::RHASH_SHA1: + RTGenSHA1Kernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + case rcuda::RHASH_LM: + RTGenLMKernel<<>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1)); + break; + } cuErr = cudaGetLastError(); if(cuErr == cudaSuccess) cuErr = cudaThreadSynchronize(); - } - tEnd = time(NULL); - fprintf(stderr, "Kernel run time: %i\n", (tEnd - tStart)); if(cuErr == cudaSuccess) cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64), cudaMemcpyDeviceToHost); @@ -324,3 +202,7 @@ extern "C" int GetChainsBufferSize(int minSize) { grSizeX = GRID_X_SIZE; return grSizeX*grSizeY*BLOCK_X_SIZE; } + +extern "C" int SetCudaDevice(int device) { + return cudaSetDevice(device); +} diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.h b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.h index 370b064..c641eda 100644 --- a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.h +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.h @@ -24,8 +24,9 @@ struct RCudaTask { unsigned int rainbowChainLen; }; -extern "C" int CalcChainsOnCUDA(const RCudaTask* task, uint64 *resultBuff); +extern "C" int SetCudaDevice(int device); extern "C" int GetChainsBufferSize(int minSize); +extern "C" int CalcChainsOnCUDA(const RCudaTask* task, uint64 *resultBuff); } diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ext.cpp b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ext.cpp index 74b8d9e..ea78a0d 100644 --- a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ext.cpp +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ext.cpp @@ -1,7 +1,7 @@ //============================================================================ // Name : rcuda_ext.cpp // Author : Jan Kyska -// Version : 0.9 +// Version : 1.00 // Description : A code to access internals of the CChainWalkContext // for the CUDA generator of FreeRainbowTables //============================================================================ @@ -76,7 +76,8 @@ int CudaCWCExtender::IndexToStartPlain(const uint64 nIndex, std::vector> n ) ^ data[ib]) & m; + data[ib] ^= data[it]; + data[ia] ^= data[it] << n; +} + +__device__ void HPERM_OP(int ia, int it, int n, unsigned int m, unsigned int* data) { + data[it] = ((data[ia] << (16-n)) ^ data[ia]) & m; + data[ia] = data[ia] ^ data[it] ^ (data[it]>>(16-n)); +} + +__device__ void IP(int il, int ir, int it, unsigned int* data) { + PERM_OP(ir, il, it, 4, 0x0f0f0f0f, data); + PERM_OP(il, ir, it, 16, 0x0000ffff, data); + PERM_OP(ir, il, it, 2, 0x33333333, data); + PERM_OP(il, ir, it, 8, 0x00ff00ff, data); + PERM_OP(ir, il, it, 1, 0x55555555, data); +} + +__device__ void FP(int il, int ir, int it, unsigned int* data) { + PERM_OP(il, ir, it, 1, 0x55555555, data); + PERM_OP(ir, il, it, 8, 0x00ff00ff, data); + PERM_OP(il, ir, it, 2, 0x33333333, data); + PERM_OP(ir, il, it, 16, 0x0000ffff, data); + PERM_OP(il, ir, it, 4, 0x0f0f0f0f, data); +} + +__device__ unsigned int D_ENCRYPT(unsigned int ll, unsigned int uu, unsigned int tt) { + tt = (tt>>4)|(tt<<28); + return ll ^ des_SPtrans[0][(uu>>2)&0x3f] ^ + des_SPtrans[2][(uu>>10)&0x3f] ^ + des_SPtrans[4][(uu>>18)&0x3f] ^ + des_SPtrans[6][(uu>>26)&0x3f] ^ + des_SPtrans[1][(tt>>2)&0x3f] ^ + des_SPtrans[3][(tt>>10)&0x3f] ^ + des_SPtrans[5][(tt>>18)&0x3f] ^ + des_SPtrans[7][(tt>>26)&0x3f]; +} + +} + +__global__ void RTGenLMKernel(unsigned int chainStart, unsigned int chainStop) { + uint3 dimItem; + unsigned int *hData2, *hData3; + uint64 uiDiv64, uiVal64, uiMul64; + unsigned int uiVal, uiDiv; + unsigned int jj, rs, rt; + + __shared__ unsigned int shData2[SHIDX(BLOCK_X_SIZE)]; + __shared__ unsigned int shData3[BLOCK_X_SIZE<<2]; + hData2 = shData2 + ((threadIdx.x>>4)<<8)+(threadIdx.x&15); + hData3 = shData3 + ((threadIdx.x>>4)<<6)+(threadIdx.x&15); + + RTGEN_PROLOGUE; + + // transform to the plain text + for(ii = 0; ii < 8; ii++) + hData[SHIDX(ii)] = 0; + + for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) { + uiVal64 = idx64 + cplStart[ii]; + uiVal64--; + dimItem = cplDimVec[ii]; + + uiMul64 = (uint64)dimItem.y<<32; + idx64 = __umul64hi(uiVal64, uiMul64); + uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x; + uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y); + uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x; + idx64 += uiVal; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx64++; + } + hData[SHIDX(ii&7)] = cplChrSet[dimItem.z + uiDiv]; + } + + for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) { + uiVal = idx + cplStart[ii]; + uiVal--; + dimItem = cplDimVec[ii]; + + idx = __umulhi(uiVal, dimItem.y); + uiDiv = uiVal - idx*dimItem.x; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx++; + } + hData[SHIDX(ii&7)] = cplChrSet[dimItem.z + uiDiv]; + } + + for(jj = 8, ii--; jj < 15; jj++, ii--) + hData[SHIDX(jj)] = hData[SHIDX(ii&7)]; + + // set key + ii = 255; + uiVal = ((hData[SHIDX(10)] << 5) | (hData[SHIDX(11)] >> 3))ⅈ + uiVal = (uiVal<<8) | (((hData[SHIDX(9)] << 6) | (hData[SHIDX(10)] >> 2))&ii); + uiVal = (uiVal<<8) | (((hData[SHIDX(8)] << 7) | (hData[SHIDX(9)] >> 1))&ii); + uiVal = (uiVal<<8) | hData[SHIDX(8)]; + + uiDiv = (hData[SHIDX(14)] << 1)ⅈ + uiDiv = (uiDiv<<8) | (((hData[SHIDX(13)] << 2) | (hData[SHIDX(14)] >> 6))&ii); + uiDiv = (uiDiv<<8) | (((hData[SHIDX(12)] << 3) | (hData[SHIDX(13)] >> 5))&ii); + uiDiv = (uiDiv<<8) | (((hData[SHIDX(11)] << 4) | (hData[SHIDX(12)] >> 4))&ii); + + hData[SHIDX(0)] = uiVal; + hData[SHIDX(1)] = uiDiv; + RC_LM::PERM_OP(SHIDX(1), SHIDX(0), SHIDX(2), 4, 0x0f0f0f0f, hData); + RC_LM::HPERM_OP(SHIDX(0), SHIDX(2), -2, 0xcccc0000, hData); + RC_LM::HPERM_OP(SHIDX(1), SHIDX(2), -2, 0xcccc0000, hData); + RC_LM::PERM_OP(SHIDX(1), SHIDX(0), SHIDX(2), 1, 0x55555555, hData); + RC_LM::PERM_OP(SHIDX(0), SHIDX(1), SHIDX(2), 8, 0x00ff00ff, hData); + RC_LM::PERM_OP(SHIDX(1), SHIDX(0), SHIDX(2), 1, 0x55555555, hData); + uiVal = hData[SHIDX(0)]; + uiDiv = hData[SHIDX(1)]; + uiDiv = ((uiDiv&0x000000ff)<<16) | (uiDiv&0x0000ff00) | ((uiDiv&0x00ff0000)>>16) | ((uiVal&0xf0000000)>>4); + uiVal &= 0x0fffffff; + + for(ii = 0; ii < ITERATIONS; ii++) { + if(RC_LM::shifts2[ii]) { + uiVal = ((uiVal>>2)|(uiVal<<26)); + uiDiv =((uiDiv>>2)|(uiDiv<<26)); + } else { + uiVal = ((uiVal>>1)|(uiVal<<27)); + uiDiv = ((uiDiv>>1)|(uiDiv<<27)); + } + uiVal &= 0x0fffffff; + uiDiv &= 0x0fffffff; + + rs = RC_LM::des_skb[0][uiVal&0x3f] | + RC_LM::des_skb[1][((uiVal>>6)&0x03)|((uiVal>>7)&0x3c)] | + RC_LM::des_skb[2][((uiVal>>13)&0x0f)|((uiVal>>14)&0x30)] | + RC_LM::des_skb[3][((uiVal>>20)&0x01)|((uiVal>>21)&0x06) | + ((uiVal>>22)&0x38)]; + rt = RC_LM::des_skb[4][uiDiv&0x3f] | + RC_LM::des_skb[5][((uiDiv>>7)&0x03)|((uiDiv>>8)&0x3c)] | + RC_LM::des_skb[6][(uiDiv>>15)&0x3f] | + RC_LM::des_skb[7][((uiDiv>>21)&0x0f)|((uiDiv>>22)&0x30)]; + + /* table contained 0213 4657 */ + idx = (rt<<16)|(rs&0x0000ffff); + hData[SHIDX(ii)] = (idx>>30)|(idx<<2); + idx = (rs>>16)|(rt&0xffff0000); + hData2[SHIDX(ii)] = (idx>>26)|(idx<<6); + } + + // encrypt the "magic" data + hData3[SHIDX(0)] = 0x2153474B; + hData3[SHIDX(1)] = 0x25242340; + + RC_LM::IP(SHIDX(0), SHIDX(1), SHIDX(2), hData3); + uiVal = hData3[SHIDX(0)]; + uiVal = ((uiVal>>29)|(uiVal<<3)); + uiDiv = hData3[SHIDX(1)]; + uiDiv = ((uiDiv>>29)|(uiDiv<<3)); + + for(ii = 0; ii < 16; ii+=2) { + uiDiv = RC_LM::D_ENCRYPT(uiDiv, uiVal^hData[SHIDX(ii)], uiVal^hData2[SHIDX(ii)]); + uiVal = RC_LM::D_ENCRYPT(uiVal, uiDiv^hData[SHIDX(ii+1)], uiDiv^hData2[SHIDX(ii+1)]); + } + + hData3[SHIDX(0)] = ((uiVal>>3)|(uiVal<<29)); + hData3[SHIDX(1)] = ((uiDiv>>3)|(uiDiv<<29)); + RC_LM::FP(SHIDX(0), SHIDX(1), SHIDX(2), hData3); + + hData[SHIDX(0)] = hData3[SHIDX(1)]; + hData[SHIDX(1)] = hData3[SHIDX(0)]; + RTGEN_EPILOGUE +} diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md4.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md4.inc new file mode 100644 index 0000000..362e608 --- /dev/null +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md4.inc @@ -0,0 +1,143 @@ +//============================================================================ +// Name : rcuda_md4.inc +// Author : Jan Kyska +// Version : 1.00 +// Description : MD4 hash kernel for Generator of FreeRainbowTables +//============================================================================ + +namespace RC_MD4 { + +__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 }; +__device__ __constant__ unsigned char r[48] = { \ + 3, 7, 11, 19, 3, 7, 11, 19, 3, 7, 11, 19, 3, 7, 11, 19, \ + 3, 5, 9, 13, 3, 5, 9, 13, 3, 5, 9, 13, 3, 5, 9, 13, \ + 3, 9, 11, 15, 3, 9, 11, 15, 3, 9, 11, 15, 3, 9, 11, 15 }; +__device__ __constant__ unsigned char g[48] = { \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, \ + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; + +__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + ((b&c)|((~b)&d)) + data[SHIDX(g[i])]; + ret = (ret<>(32-r[i])); + return ret; +} + +__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + ((b&c)|(b&d)|(c&d)) + data[SHIDX(g[i])] + 0x5a827999u; + ret = (ret<>(32-r[i])); + return ret; +} + +__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + (b^c^d) + data[SHIDX(g[i])] + 0x6ed9eba1u; + ret = (ret<>(32-r[i])); + return ret; +} + +__device__ void MD4(unsigned int* dataHash) { + unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x; + int ii; + + // Round 1 + for(ii = 0; ii < 16; ii++) { + x = b; + b = FF(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + // Round 2 + for(; ii < 32; ii++) { + x = b; + b = GG(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + // Round 3 + for(; ii < 48; ii++) { + x = b; + b = HH(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + dataHash[SHIDX(0)] = a + h[0]; + dataHash[SHIDX(1)] = b + h[1]; + dataHash[SHIDX(2)] = c + h[2]; + dataHash[SHIDX(3)] = d + h[3]; +} + +} + +__global__ void RTGenMD4Kernel(unsigned int chainStart, unsigned int chainStop) { + uint3 dimItem; + uint64 uiDiv64, uiVal64, uiMul64; + unsigned int uiVal, uiDiv; + unsigned int size, jj; + unsigned int plain; + + RTGEN_PROLOGUE; + + // transform to the plain text + plain = 0x80; + jj = (PLAIN_MAX_SIZE>>2)+1; + + for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) { + uiVal64 = idx64 + cplStart[ii]; + uiVal64--; + dimItem = cplDimVec[ii]; + + uiMul64 = (uint64)dimItem.y<<32; + idx64 = __umul64hi(uiVal64, uiMul64); + uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x; + uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y); + uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x; + idx64 += uiVal; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx64++; + } + plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv]; + if((ii&3) == 2) { + hData[SHIDX(jj--)] = plain; + plain = 0; + } + } + + for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) { + uiVal = idx + cplStart[ii]; + uiVal--; + dimItem = cplDimVec[ii]; + + idx = __umulhi(uiVal, dimItem.y); + uiDiv = uiVal - idx*dimItem.x; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx++; + } + plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv]; + if((ii&3) == 2) { + hData[SHIDX(jj--)] = plain; + plain = 0; + } + } + + // prepare for MD4 + size = ii; + ii = ((((3-(ii&3))<<3)-1)&0x1f)+1; + plain = plain<>2)+1; plain = hData[SHIDX(jj++)], idx++) + hData[SHIDX(idx)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii)); + hData[SHIDX(idx)] = plain>>ii; + for(idx++; idx < 14; idx++) + hData[SHIDX(idx)] = 0; + hData[SHIDX(idx++)] = size<<3; + hData[SHIDX(idx)] = 0; + + // hash + RC_MD4::MD4(hData); + + RTGEN_EPILOGUE +} diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md5.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md5.inc new file mode 100644 index 0000000..77d8696 --- /dev/null +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md5.inc @@ -0,0 +1,183 @@ +//============================================================================ +// Name : rcuda_md5.inc +// Author : Jan Kyska +// Version : 1.00 +// Description : MD5 hash kernel for Generator of FreeRainbowTables +//============================================================================ + +namespace RC_MD5 { + +__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 }; +__device__ __constant__ unsigned char r[64] = { \ + 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, \ + 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, \ + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, \ + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21 }; +__device__ __constant__ unsigned char g[64] = { \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, \ + 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, \ + 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 }; +__device__ __constant__ unsigned int ac[64] = { \ + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, \ + 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, \ + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, \ + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, \ + \ + 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, \ + 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, \ + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, \ + 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, \ + \ + 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, \ + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, \ + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, \ + 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, \ + \ + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, \ + 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, \ + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, \ + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 }; + +__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + ((b&c)|((~b)&d)) + ac[i] + data[SHIDX(g[i])]; + ret = (ret<>(32-r[i])); + ret += b; + return ret; +} + +__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + ((b&d)|(c&(~d))) + ac[i] + data[SHIDX(g[i])]; + ret = (ret<>(32-r[i])); + ret += b; + return ret; +} + +__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + (b^c^d) + ac[i] + data[SHIDX(g[i])]; + ret = (ret<>(32-r[i])); + ret += b; + return ret; +} + +__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + (c^(b|(~d))) + ac[i] + data[SHIDX(g[i])]; + ret = (ret<>(32-r[i])); + ret += b; + return ret; +} + +__device__ void MD5(unsigned int* dataHash) { + unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x; + int ii; + + // Round 1 + for(ii = 0; ii < 16; ii++) { + x = b; + b = FF(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + // Round 2 + for(; ii < 32; ii++) { + x = b; + b = GG(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + // Round 3 + for(; ii < 48; ii++) { + x = b; + b = HH(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + // Round 4 + for(; ii < 64; ii++) { + x = b; + b = II(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + dataHash[SHIDX(0)] = a + h[0]; + dataHash[SHIDX(1)] = b + h[1]; + dataHash[SHIDX(2)] = c + h[2]; + dataHash[SHIDX(3)] = d + h[3]; +} + +} + +__global__ void RTGenMD5Kernel(unsigned int chainStart, unsigned int chainStop) { + uint3 dimItem; + uint64 uiDiv64, uiVal64, uiMul64; + unsigned int uiVal, uiDiv; + unsigned int size, jj; + unsigned int plain; + + RTGEN_PROLOGUE; + + // transform to the plain text + plain = 0x80; + jj = (PLAIN_MAX_SIZE>>2)+1; + + for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) { + uiVal64 = idx64 + cplStart[ii]; + uiVal64--; + dimItem = cplDimVec[ii]; + + uiMul64 = (uint64)dimItem.y<<32; + idx64 = __umul64hi(uiVal64, uiMul64); + uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x; + uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y); + uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x; + idx64 += uiVal; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx64++; + } + plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv]; + if((ii&3) == 2) { + hData[SHIDX(jj--)] = plain; + plain = 0; + } + } + + for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) { + uiVal = idx + cplStart[ii]; + uiVal--; + dimItem = cplDimVec[ii]; + + idx = __umulhi(uiVal, dimItem.y); + uiDiv = uiVal - idx*dimItem.x; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx++; + } + plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv]; + if((ii&3) == 2) { + hData[SHIDX(jj--)] = plain; + plain = 0; + } + } + + // prepare for MD5 + size = ii; + ii = ((((3-(ii&3))<<3)-1)&0x1f)+1; + plain = plain<>2)+1; plain = hData[SHIDX(jj++)], idx++) + hData[SHIDX(idx)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii)); + hData[SHIDX(idx)] = plain>>ii; + for(idx++; idx < 14; idx++) + hData[SHIDX(idx)] = 0; + hData[SHIDX(idx++)] = size<<3; + hData[SHIDX(idx)] = 0; + + // hash + RC_MD5::MD5(hData); + + RTGEN_EPILOGUE +} diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ntlm.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ntlm.inc new file mode 100644 index 0000000..1c14c5b --- /dev/null +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ntlm.inc @@ -0,0 +1,145 @@ +//============================================================================ +// Name : rcuda_ntlm.inc +// Author : Jan Kyska +// Version : 1.00 +// Description : NTLM hash kernel for Generator of FreeRainbowTables +//============================================================================ + +namespace RC_NTLM { + +__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 }; +__device__ __constant__ unsigned char r[48] = { \ + 3, 7, 11, 19, 3, 7, 11, 19, 3, 7, 11, 19, 3, 7, 11, 19, \ + 3, 5, 9, 13, 3, 5, 9, 13, 3, 5, 9, 13, 3, 5, 9, 13, \ + 3, 9, 11, 15, 3, 9, 11, 15, 3, 9, 11, 15, 3, 9, 11, 15 }; +__device__ __constant__ unsigned char g[48] = { \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, \ + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; + +__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + ((b&c)|((~b)&d)) + data[SHIDX(g[i])]; + ret = (ret<>(32-r[i])); + return ret; +} + +__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + ((b&c)|(b&d)|(c&d)) + data[SHIDX(g[i])] + 0x5a827999u; + ret = (ret<>(32-r[i])); + return ret; +} + +__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) { + unsigned int ret; + ret = a + (b^c^d) + data[SHIDX(g[i])] + 0x6ed9eba1u; + ret = (ret<>(32-r[i])); + return ret; +} + +__device__ void MD4(unsigned int* dataHash) { + unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x; + int ii; + + // Round 1 + for(ii = 0; ii < 16; ii++) { + x = b; + b = FF(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + // Round 2 + for(; ii < 32; ii++) { + x = b; + b = GG(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + // Round 3 + for(; ii < 48; ii++) { + x = b; + b = HH(a, b, c, d, ii, dataHash); + a = d; d = c; c = x; + } + + dataHash[SHIDX(0)] = a + h[0]; + dataHash[SHIDX(1)] = b + h[1]; + dataHash[SHIDX(2)] = c + h[2]; + dataHash[SHIDX(3)] = d + h[3]; +} + +} + +__global__ void RTGenNTLMKernel(unsigned int chainStart, unsigned int chainStop) { + uint3 dimItem; + uint64 uiDiv64, uiVal64, uiMul64; + unsigned int uiVal, uiDiv; + unsigned int size, jj, kk; + unsigned int plain; + + RTGEN_PROLOGUE; + + // transform to the plain text + plain = 0x80; + jj = (PLAIN_MAX_SIZE>>1)+1; + + for(ii = kk = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) { + uiVal64 = idx64 + cplStart[ii]; + uiVal64--; + dimItem = cplDimVec[ii]; + + uiMul64 = (uint64)dimItem.y<<32; + idx64 = __umul64hi(uiVal64, uiMul64); + uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x; + uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y); + uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x; + idx64 += uiVal; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx64++; + } + plain = (plain<<8); + if((kk++)&1) { + hData[SHIDX(jj--)] = plain; + plain = 0; + } + plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv]; + } + + for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) { + uiVal = idx + cplStart[ii]; + uiVal--; + dimItem = cplDimVec[ii]; + + idx = __umulhi(uiVal, dimItem.y); + uiDiv = uiVal - idx*dimItem.x; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx++; + } + plain = (plain<<8); + if((kk++)&1) { + hData[SHIDX(jj--)] = plain; + plain = 0; + } + plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv]; + } + + // prepare for MD4 + size = (ii<<1); + ii = (((kk^1)&1)<<4)+8; + plain = plain<>1)+1; plain = hData[SHIDX(jj++)], idx++) + hData[SHIDX(idx)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii)); + hData[SHIDX(idx)] = plain>>ii; + for(idx++; idx < 14; idx++) + hData[SHIDX(idx)] = 0; + hData[SHIDX(idx++)] = size<<3; + hData[SHIDX(idx)] = 0; + + // hash + RC_NTLM::MD4(hData); + + RTGEN_EPILOGUE +} diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_sha1.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_sha1.inc new file mode 100644 index 0000000..af621d2 --- /dev/null +++ b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_sha1.inc @@ -0,0 +1,160 @@ +//============================================================================ +// Name : rcuda_sha1.inc +// Author : Jan Kyska +// Version : 1.00 +// Description : SHA1 hash kernel for Generator of FreeRainbowTables +//============================================================================ + +namespace RC_SHA1 { + +__device__ __constant__ unsigned int h[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 }; + +__device__ unsigned int SwapEndian(unsigned int n) { + return (n<<24)|((n&0x0000ff00)<<8)|((n>>8)&0x0000ff00)|(n>>24); +} + +__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) { + return ((a<<5)|(a>>27)) + ((b&c)|((~b)&d)) + e + 0x5A827999 + data[SHIDX(i)]; +} + +__device__ unsigned int FF2(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) { + unsigned int dt; + dt = data[SHIDX(i&15)]^data[SHIDX((i-3)&15)]^data[SHIDX((i-8)&15)]^data[SHIDX((i-14)&15)]; + data[SHIDX(i&15)] = dt = ((dt<<1)|(dt>>31)); + return ((a<<5)|(a>>27)) + ((b&c)|((~b)&d)) + e + 0x5A827999 + dt; +} + +__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) { + unsigned int dt; + dt = data[SHIDX(i&15)]^data[SHIDX((i-3)&15)]^data[SHIDX((i-8)&15)]^data[SHIDX((i-14)&15)]; + data[SHIDX(i&15)] = dt = ((dt<<1)|(dt>>31)); + return ((a<<5)|(a>>27)) + (b^c^d) + e + 0x6ED9EBA1 + dt; +} + +__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) { + unsigned int dt; + dt = data[SHIDX(i&15)]^data[SHIDX((i-3)&15)]^data[SHIDX((i-8)&15)]^data[SHIDX((i-14)&15)]; + data[SHIDX(i&15)] = dt = ((dt<<1)|(dt>>31)); + return ((a<<5)|(a>>27)) + ((b&c)|(b&d)|(c&d)) + e + 0x8F1BBCDC + dt; +} + +__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) { + unsigned int dt; + dt = data[SHIDX(i&15)]^data[SHIDX((i-3)&15)]^data[SHIDX((i-8)&15)]^data[SHIDX((i-14)&15)]; + data[SHIDX(i&15)] = dt = ((dt<<1)|(dt>>31)); + return ((a<<5)|(a>>27)) + (b^c^d) + e + 0xCA62C1D6 + dt; +} + +__device__ void SHA1(unsigned int* dataHash) { + unsigned int a = h[0], b = h[1], c = h[2], d = h[3], e = h[4], x; + int ii; + + // Round 1 + for(ii = 0; ii < 16; ii++) { + x = FF(a, b, c, d, e, ii, dataHash); + e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x; + } + for(; ii < 20; ii++) { + x = FF2(a, b, c, d, e, ii, dataHash); + e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x; + } + + // Round 2 + for(; ii < 40; ii++) { + x = GG(a, b, c, d, e, ii, dataHash); + e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x; + } + + // Round 3 + for(; ii < 60; ii++) { + x = HH(a, b, c, d, e, ii, dataHash); + e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x; + } + + // Round 4 + for(; ii < 80; ii++) { + x = II(a, b, c, d, e, ii, dataHash); + e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x; + } + + dataHash[SHIDX(0)] = a + h[0]; + dataHash[SHIDX(1)] = b + h[1]; + dataHash[SHIDX(2)] = c + h[2]; + dataHash[SHIDX(3)] = d + h[3]; + dataHash[SHIDX(4)] = e + h[4]; +} + +} + +__global__ void RTGenSHA1Kernel(unsigned int chainStart, unsigned int chainStop) { + uint3 dimItem; + uint64 uiDiv64, uiVal64, uiMul64; + unsigned int uiVal, uiDiv; + unsigned int size, jj; + unsigned int plain; + + RTGEN_PROLOGUE; + + // transform to the plain text + plain = 0x80; + jj = (PLAIN_MAX_SIZE>>2)+1; + + for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) { + uiVal64 = idx64 + cplStart[ii]; + uiVal64--; + dimItem = cplDimVec[ii]; + + uiMul64 = (uint64)dimItem.y<<32; + idx64 = __umul64hi(uiVal64, uiMul64); + uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x; + uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y); + uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x; + idx64 += uiVal; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx64++; + } + plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv]; + if((ii&3) == 2) { + hData[SHIDX(jj--)] = plain; + plain = 0; + } + } + + for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) { + uiVal = idx + cplStart[ii]; + uiVal--; + dimItem = cplDimVec[ii]; + + idx = __umulhi(uiVal, dimItem.y); + uiDiv = uiVal - idx*dimItem.x; + if(uiDiv >= dimItem.x) { + uiDiv -= dimItem.x; + idx++; + } + plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv]; + if((ii&3) == 2) { + hData[SHIDX(jj--)] = plain; + plain = 0; + } + } + + // prepare for SHA1 + size = ii; + ii = ((((3-(ii&3))<<3)-1)&0x1f)+1; + plain = plain<>2)+1; plain = hData[SHIDX(jj++)], idx++) + hData[SHIDX(idx)] = RC_SHA1::SwapEndian((plain>>ii)|(hData[SHIDX(jj)]<<(32-ii))); + hData[SHIDX(idx)] = RC_SHA1::SwapEndian(plain>>ii); + for(idx++; idx < 14; idx++) + hData[SHIDX(idx)] = 0; + hData[SHIDX(idx++)] = 0; + hData[SHIDX(idx)] = size<<3; + + // hash + RC_SHA1::SHA1(hData); + + hData[SHIDX(0)] = RC_SHA1::SwapEndian(hData[SHIDX(0)]); + hData[SHIDX(1)] = RC_SHA1::SwapEndian(hData[SHIDX(1)]); + RTGEN_EPILOGUE +} -- 2.39.2