+//============================================================================
+// Name : rcuda.cu
+// Author : Jan Kyska
+// Version : 1.00
+// Description : Generator of FreeRainbowTables / MD5, MD4, NTLM, SHA1, LM
+//============================================================================
+
#include <stdio.h>
#include <cuda.h>
#include "rcuda.h"
#define GRID_X_L2 6
#define GRID_Y_L2 6
-#define BLOCK_X_L2 7
+#define BLOCK_X_L2 6
#define GRID_X_SIZE (1<<GRID_X_L2)
#define GRID_Y_SIZE (1<<GRID_Y_L2)
#define BLOCK_X_SIZE (1<<BLOCK_X_L2)
+#define PLAIN_MAX_SIZE 20
#define KERN_CHAIN_SIZE 100
+#define CHAR_SET_MAXLEN 512
#define SHIDX(x) ((x)<<4)
-__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 };
-__device__ __constant__ unsigned char r[64] = { \
- 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, \
- 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, \
- 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, \
- 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21 };
-__device__ __constant__ unsigned char g[64] = { \
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
- 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, \
- 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, \
- 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 };
-__device__ __constant__ unsigned int ac[64] = { \
- 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, \
- 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, \
- 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, \
- 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, \
- \
- 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, \
- 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, \
- 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, \
- 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, \
- \
- 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, \
- 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, \
- 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, \
- 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, \
- \
- 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, \
- 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, \
- 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, \
- 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };
-
-__device__ __constant__ unsigned int testData[16] = { \
- 0x79706d63, 0x6d627667, 0x00000080, 0x00000000, \
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, \
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, \
- 0x00000000, 0x00000000, 0x00000040, 0x00000000 };
-
-__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
- unsigned int ret;
- ret = a + ((b&c)|((~b)&d)) + ac[i] + data[SHIDX(g[i])];
- ret = (ret<<r[i])|(ret>>(32-r[i]));
- ret += b;
- return ret;
-}
-
-__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
- unsigned int ret;
- ret = a + ((b&d)|(c&(~d))) + ac[i] + data[SHIDX(g[i])];
- ret = (ret<<r[i])|(ret>>(32-r[i]));
- ret += b;
- return ret;
-}
-
-__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
- unsigned int ret;
- ret = a + (b^c^d) + ac[i] + data[SHIDX(g[i])];
- ret = (ret<<r[i])|(ret>>(32-r[i]));
- ret += b;
- return ret;
-}
-
-__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
- unsigned int ret;
- ret = a + (c^(b|(~d))) + ac[i] + data[SHIDX(g[i])];
- ret = (ret<<r[i])|(ret>>(32-r[i]));
- ret += b;
- return ret;
-}
-
-__device__ void MD5(unsigned int* dataHash) {
- unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x;
- int ii;
-
- // Round 1
- for(ii = 0; ii < 16; ii++) {
- x = b;
- b = FF(a, b, c, d, ii, dataHash);
- a = d; d = c; c = x;
- }
-
- // Round 2
- for(; ii < 32; ii++) {
- x = b;
- b = GG(a, b, c, d, ii, dataHash);
- a = d; d = c; c = x;
- }
-
- // Round 3
- for(; ii < 48; ii++) {
- x = b;
- b = HH(a, b, c, d, ii, dataHash);
- a = d; d = c; c = x;
- }
-
- // Round 4
- for(; ii < 64; ii++) {
- x = b;
- b = II(a, b, c, d, ii, dataHash);
- a = d; d = c; c = x;
- }
-
- dataHash[SHIDX(0)] = a + h[0];
- dataHash[SHIDX(1)] = b + h[1];
- dataHash[SHIDX(2)] = c + h[2];
- dataHash[SHIDX(3)] = d + h[3];
-}
-
-
-#define PLAIN_MAX_SIZE 20
-
-__device__ unsigned __int64 *dataHeap;
+__device__ uint64 *dataHeap;
__device__ unsigned char *plStart;
__device__ uint3 *plDimVec;
__device__ unsigned char *plChrSet;
__device__ int *plCpPos;
__device__ int plCpPosSize;
-__device__ unsigned __int64 reduceOffset;
-__device__ unsigned __int64 plainSpaceTotal;
-__device__ unsigned __int64 rPlainSpaceTotal;
-
-
-__global__ void RTGenMD5Kernel(unsigned int chainStart, unsigned int chainStop) {
- unsigned int *hData;
- uint3 dimItem;
- unsigned int uiVal, uiMul, uiDiv, idx;
- unsigned __int64 uiVal64, uiMul64, uiDiv64, idx64;
- unsigned int nPos, size, ii, jj, kk;
- unsigned int cpcheck, checkpoint;
- unsigned int plain;
-
- __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];
-
- if(threadIdx.x == 0) {
- nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;
- for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {
- hData = shData + ((ii>>4)<<8)+(ii&15);
- hData[SHIDX(0)] = dataHeap[nPos];
- hData[SHIDX(1)] = dataHeap[nPos]>>32;
- hData[SHIDX(2)] = dataHeap[nPos+1];
- }
- }
- __syncthreads();
-
- hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15);
-
- idx64 = hData[SHIDX(1)];
- idx64 = (idx64<<32) | hData[SHIDX(0)];
- cpcheck = hData[SHIDX(2)];
- checkpoint = cpcheck&0x0000ffff;
- cpcheck = cpcheck>>16;
-
+__device__ uint64 reduceOffset;
+__device__ uint64 plainSpaceTotal;
+__device__ uint64 rPlainSpaceTotal;
+
+#define RTGEN_PROLOGUE \
+ unsigned int *hData; \
+ unsigned int idx; \
+ uint64 idx64; \
+ unsigned int nPos, ii; \
+ unsigned int cpcheck, checkpoint; \
+ \
+ __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)]; \
+ __shared__ unsigned char cplChrSet[CHAR_SET_MAXLEN]; \
+ __shared__ unsigned char cplStart[PLAIN_MAX_SIZE]; \
+ __shared__ uint3 cplDimVec[PLAIN_MAX_SIZE]; \
+ \
+ if(threadIdx.x == 0) { \
+ nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1; \
+ for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
+ hData = shData + ((ii>>4)<<8)+(ii&15); \
+ hData[SHIDX(0)] = dataHeap[nPos]; \
+ hData[SHIDX(1)] = dataHeap[nPos]>>32; \
+ hData[SHIDX(2)] = dataHeap[nPos+1]; \
+ } \
+ memcpy(cplChrSet, plChrSet, CHAR_SET_MAXLEN); \
+ memcpy(cplStart, plStart, PLAIN_MAX_SIZE); \
+ memcpy(cplDimVec, plDimVec, PLAIN_MAX_SIZE*sizeof(uint3)); \
+ } \
+ __syncthreads(); \
+ \
+ hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15); \
+ \
+ idx64 = hData[SHIDX(1)]; \
+ idx64 = (idx64<<32) | hData[SHIDX(0)]; \
+ cpcheck = hData[SHIDX(2)]; \
+ checkpoint = cpcheck&0x0000ffff; \
+ cpcheck = cpcheck>>16; \
+ \
for(nPos = chainStart; nPos < chainStop; nPos++) {
- // transform to the plain text
- plain = 0x80;
- jj = (PLAIN_MAX_SIZE>>2)+1;
- for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) {
- uiVal64 = idx64 + plStart[ii];
- uiVal64--;
- dimItem = plDimVec[ii];
- uiMul64 = (unsigned __int64)dimItem.y<<32;
- idx64 = __umul64hi(uiVal64, uiMul64);
- uiDiv64 = uiVal64 - idx64*(unsigned __int64)dimItem.x;
- uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);
- uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;
- idx64 += uiVal;
- if(uiDiv >= dimItem.x) {
- uiDiv -= dimItem.x;
- idx64++;
- }
- plain = (plain<<8) | plChrSet[dimItem.z + uiDiv];
- if((ii&3) == 2) {
- hData[SHIDX(jj--)] = plain;
- plain = 0;
- }
- }
- for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) {
- uiVal = idx + plStart[ii];
- uiVal--;
- dimItem = plDimVec[ii];
- idx = __umulhi(uiVal, dimItem.y);
- uiDiv = uiVal - idx*dimItem.x;
- if(uiDiv >= dimItem.x) {
- uiDiv -= dimItem.x;
- idx++;
- }
- plain = (plain<<8) | plChrSet[dimItem.z + uiDiv];
- if((ii&3) == 2) {
- hData[SHIDX(jj--)] = plain;
- plain = 0;
- }
- }
-
- // prepare for MD5
- size = ii;
- ii = ((((3-(ii&3))<<3)-1)&0x1f)+1;
- plain = plain<<ii;
- for(jj++, kk = 0; jj <= (PLAIN_MAX_SIZE>>2)+1; plain = hData[SHIDX(jj++)], kk++)
- hData[SHIDX(kk)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii));
- hData[SHIDX(kk)] = plain>>ii;
- for(kk++; kk < 14; kk++)
- hData[SHIDX(kk)] = 0;
- hData[SHIDX(kk++)] = size<<3;
- hData[SHIDX(kk)] = 0;
- // hash
- MD5(hData);
- idx64 = hData[SHIDX(1)];
- idx64 = (idx64<<32) | hData[SHIDX(0)];
- idx64 += reduceOffset + nPos;
- uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);
- idx64 -= uiDiv64*plainSpaceTotal;
- if(idx64 >= plainSpaceTotal)
- idx64 -= plainSpaceTotal;
-
- if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) {
- checkpoint |= ((unsigned int)idx64&1) << cpcheck;
- cpcheck++;
- }
+#define RTGEN_EPILOGUE \
+ idx64 = hData[SHIDX(1)]; \
+ idx64 = (idx64<<32) | hData[SHIDX(0)]; \
+ idx64 += reduceOffset + nPos; \
+ uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal); \
+ idx64 -= uiDiv64*plainSpaceTotal; \
+ if(idx64 >= plainSpaceTotal) \
+ idx64 -= plainSpaceTotal; \
+ \
+ if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) { \
+ checkpoint |= ((unsigned int)idx64&1) << cpcheck; \
+ cpcheck++; \
+ } \
+ } \
+ \
+ hData[SHIDX(0)] = idx64; \
+ hData[SHIDX(1)] = idx64>>32; \
+ hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff); \
+ __syncthreads(); \
+ \
+ if(threadIdx.x == 0) { \
+ nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1; \
+ for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) { \
+ hData = shData + ((ii>>4)<<8)+(ii&15); \
+ dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)]; \
+ dataHeap[nPos+1] = hData[SHIDX(2)]; \
+ } \
}
- hData[SHIDX(0)] = idx64;
- hData[SHIDX(1)] = idx64>>32;
- hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff);
- __syncthreads();
-
- if(threadIdx.x == 0) {
- nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;
- for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {
- hData = shData + ((ii>>4)<<8)+(ii&15);
- dataHeap[nPos] = ((unsigned __int64)hData[SHIDX(1)]<<32)|(unsigned __int64)hData[SHIDX(0)];
- dataHeap[nPos+1] = hData[SHIDX(2)];
- }
- }
- __syncthreads();
-}
+#include "rcuda_md5.inc"
+#include "rcuda_md4.inc"
+#include "rcuda_ntlm.inc"
+#include "rcuda_sha1.inc"
+#include "rcuda_lm.inc"
-extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, unsigned __int64 *resultBuff) {
+extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff) {
cudaError_t cuErr;
char buff[PLAIN_MAX_SIZE];
- unsigned __int64 *data;
+ uint64 *data;
unsigned char *stPlain;
uint3 *dimVec;
unsigned char *charSet;
int *cpPos;
- unsigned __int64 uiVal64;
- time_t tStart, tEnd;
- if(task->hash != rcuda::RHASH_MD5)
+ uint64 uiVal64;
+
+ if(task->charSetSize > CHAR_SET_MAXLEN)
+ return -1;
+
+ switch(task->hash) {
+ case rcuda::RHASH_MD5:
+ case rcuda::RHASH_MD4:
+ case rcuda::RHASH_NTLM:
+ case rcuda::RHASH_SHA1:
+ case rcuda::RHASH_LM:
+ break;
+ default:
return 0;
-
+ }
+
memset(buff, 0, PLAIN_MAX_SIZE);
- cudaMalloc((void**)&data, task->idxCount*2*sizeof(unsigned __int64));
+ cudaMalloc((void**)&data, task->idxCount*2*sizeof(uint64));
cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE);
- cudaMalloc((void**)&dimVec, task->dimVecSize*sizeof(uint3));
- cudaMalloc((void**)&charSet, task->charSetSize);
+ cudaMalloc((void**)&dimVec, max(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3));
+ cudaMalloc((void**)&charSet, CHAR_SET_MAXLEN);
cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));
- cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(unsigned __int64), cudaMemcpyHostToDevice);
+ cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64), cudaMemcpyHostToDevice);
cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice);
- cudaMemcpy(dimVec, task->dimVec, task->dimVecSize*sizeof(uint3), cudaMemcpyHostToDevice);
- cudaMemcpy(charSet, task->charSet, task->charSetSize, cudaMemcpyHostToDevice);
+ cudaMemcpy(dimVec, task->dimVec, min(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3), cudaMemcpyHostToDevice);
+ cudaMemcpy(charSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN), cudaMemcpyHostToDevice);
cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
cudaMemcpyToSymbol(plCpPosSize, &task->cpPosSize, sizeof(task->cpPosSize));
cudaMemcpyToSymbol(reduceOffset, &task->reduceOffset, sizeof(task->reduceOffset));
cudaMemcpyToSymbol(plainSpaceTotal, &task->plainSpaceTotal, sizeof(task->plainSpaceTotal));
- uiVal64 = (unsigned __int64)-1/task->plainSpaceTotal;
+ uiVal64 = (uint64)-1/task->plainSpaceTotal;
cudaMemcpyToSymbol(rPlainSpaceTotal, &uiVal64, sizeof(uiVal64));
int grSizeX = (task->idxCount-1)/BLOCK_X_SIZE + 1;
grSizeX = GRID_X_SIZE;
dim3 numBlocks(grSizeX, grSizeY);
cuErr = cudaSuccess;
- tStart = time(NULL);
+
+ cudaSetDeviceFlags(cudaDeviceBlockingSync);
+
for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) {
- RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+ switch(task->hash) {
+ case rcuda::RHASH_MD5:
+ RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+ break;
+ case rcuda::RHASH_MD4:
+ RTGenMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+ break;
+ case rcuda::RHASH_NTLM:
+ RTGenNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+ break;
+ case rcuda::RHASH_SHA1:
+ RTGenSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+ break;
+ case rcuda::RHASH_LM:
+ RTGenLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+ break;
+ }
cuErr = cudaGetLastError();
if(cuErr == cudaSuccess)
cuErr = cudaThreadSynchronize();
-
}
- tEnd = time(NULL);
- fprintf(stderr, "Kernel run time: %i\n", (tEnd - tStart));
if(cuErr == cudaSuccess)
- cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(unsigned __int64), cudaMemcpyDeviceToHost);
+ cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64), cudaMemcpyDeviceToHost);
else
fprintf(stderr, "Error happened: %d (%s)\n", cuErr, cudaGetErrorString(cuErr));
grSizeX = GRID_X_SIZE;
return grSizeX*grSizeY*BLOCK_X_SIZE;
}
+
+extern "C" int SetCudaDevice(int device) {
+ return cudaSetDevice(device);
+}