]> git.sesse.net Git - freerainbowtables/commitdiff
merge all the algorithms for distrrtgen_cuda
authorJames Nobis <quel@quelrod.net>
Wed, 10 Nov 2010 04:21:22 +0000 (22:21 -0600)
committerJames Nobis <quel@quelrod.net>
Wed, 10 Nov 2010 04:21:22 +0000 (22:21 -0600)
clean up and fix for *nix and 64/32 compat
TBD boinc_temporary_exit() isn't functional on linux
http://bolt.berkeley.edu/trac/changeset/22382

BOINC software/BOINC client apps/distrrtgen_cuda/Makefile
BOINC software/BOINC client apps/distrrtgen_cuda/distrrtgen.cpp
BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.cu
BOINC software/BOINC client apps/distrrtgen_cuda/rcuda.h
BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ext.cpp
BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_lm.inc [new file with mode: 0644]
BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md4.inc [new file with mode: 0644]
BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md5.inc [new file with mode: 0644]
BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ntlm.inc [new file with mode: 0644]
BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_sha1.inc [new file with mode: 0644]

index dbf0bc4def7d80e6fd182bb4d41d21a96aed351b..86eb1fa981fcc0cc101e2c2d331f305634c8e0ad 100644 (file)
@@ -103,5 +103,5 @@ rcuda_ext.o: rcuda_ext.h rcuda_ext.cpp $(COMMON_API_PATH)/ChainWalkContext.h
        $(CXX) $(CXXFLAGS) rcuda_ext.cpp
 
 rcuda.o: rcuda.h rcuda.cu $(COMMON_API_PATH)/Public.h $(COMMON_API_PATH)/global.h
-       /usr/local/cuda/bin/nvcc $(DEBUG) -I../../../Common/rt\ api --compile rcuda.cu
+       /usr/local/cuda/bin/nvcc $(OPTIMIZATION) $(DEBUG) -I../../../Common/rt\ api --compile rcuda.cu
 #      /usr/local/cuda/bin/nvcc -G $(DEBUG) --machine 32 -I../../../Common/rt\ api --compile rcuda.cu
index 0d7145d5bd900564ff59a81b31bd2ea00529d54a..9ed924ea062b26654e55847952e206f98a0e2125 100644 (file)
@@ -72,6 +72,7 @@ bool early_crash = false;
 bool early_sleep = false;
 double cpu_time = 20, comp_result;
 */
+/*
 int QuickSortPartition(RainbowChainCP* pChain, int nLow, int nHigh)
 {
        int nRandomIndex = nLow + ((uint32)rand() * ((uint32)RAND_MAX + 1) + (uint32)rand()) % (nHigh - nLow + 1);
@@ -104,7 +105,7 @@ void QuickSort(RainbowChainCP* pChain, int nLow, int nHigh)
                QuickSort(pChain, nPivotLoc + 1, nHigh);
        }
 }
-
+*/
 int main(int argc, char **argv) {    
     int retval;
     double fd;
@@ -115,16 +116,29 @@ int main(int argc, char **argv) {
         fprintf(stderr, "boinc_init returned %d\n", retval);
         exit(retval);
     }
-       
-
-    // get size of input file (used to compute fraction done)
-    //
-    //file_size(input_path, fsize);
-
-    // See if there's a valid checkpoint file.
-    // If so seek input file and truncate output file
-    //
 
+       // extract a --device option
+       std::vector<char*> argVec;
+       int cudaDevice = -1;
+       for(int ii = 0; ii < argc; ii++) {
+               if(cudaDevice < 0 && strcmp(argv[ii], "--device") == 0 && ii + 1 < argc)
+                       cudaDevice = atoi(argv[++ii]);
+               else
+                       argVec.push_back(argv[ii]);
+       }
+       argc = (int)argVec.size();
+       argv = &argVec[0];
+       if(!(cudaDevice < 0))
+               // set the cuda device
+               if(rcuda::SetCudaDevice(cudaDevice) != 0)
+               {
+                       //XXX this call doesn't work on linux
+                       // fixed in upstream source 2010-09-16
+                       // http://bolt.berkeley.edu/trac/changeset/22382
+                       #ifdef _WIN32
+                               boinc_temporary_exit(60);
+                       #endif
+               }
 
        if(argc < 10)
        {
@@ -210,9 +224,7 @@ int main(int argc, char **argv) {
 
        
        // Open file
-//     fclose(fopen(sFilename.c_str(), "a"));
-//     FILE* file = fopen(sFilename.c_str(), "r+b");
-    boinc_resolve_filename("result", output_path, sizeof(output_path));
+       boinc_resolve_filename("result", output_path, sizeof(output_path));
        fclose(boinc_fopen(output_path, "a"));
        FILE *outfile = boinc_fopen(output_path, "r+b");
        
@@ -228,8 +240,8 @@ int main(int argc, char **argv) {
        unsigned int nFileLen;
        
        // Round to boundary
-       nDataLen = nDataLen / 18 * 18;
-       if (nDataLen == nRainbowChainCount * 18)
+       nDataLen = nDataLen / 10 * 10;
+       if (nDataLen == nRainbowChainCount * 10)
        {               
                std::cerr << "precomputation of this rainbow table already finished" << std::endl;
                fclose(outfile);
@@ -237,12 +249,14 @@ int main(int argc, char **argv) {
        }
 
        fseek(outfile, nDataLen, SEEK_SET);
+       //XXX size_t isn't 32/64 clean
        size_t nReturn;
        CChainWalkContext cwc;
        uint64 nIndex[2];
+       time_t tStart = time(NULL);
 
 //     std::cout << "Starting to generate chains" << std::endl;
-       int maxCalcBuffSize = rcuda::GetChainsBufferSize(5000);
+       int maxCalcBuffSize = rcuda::GetChainsBufferSize(0x2000);
        uint64 *calcBuff = new uint64[2*maxCalcBuffSize];
        int ii;
 
@@ -250,10 +264,8 @@ int main(int argc, char **argv) {
        rcuda::RCudaTask cuTask;
        std::vector<unsigned char> stPlain;
        ex.Init();
-time_t tStart, tStartFinal, tEndFinal;
-time_t tEnd;
-       tStartFinal = time(NULL);
-       for(uint32 nCurrentCalculatedChains = nDataLen / 18, calcSize; nCurrentCalculatedChains < nRainbowChainCount; )
+
+       for(int nCurrentCalculatedChains = nDataLen / 10, calcSize; nCurrentCalculatedChains < nRainbowChainCount; )
        {               
                fd = (double)nCurrentCalculatedChains / (double)nRainbowChainCount;
                boinc_fraction_done(fd);
@@ -276,21 +288,16 @@ time_t tEnd;
                        calcBuff[2*ii] = cuTask.startIdx + ii;
                        calcBuff[2*ii+1] = 0;
                }
-
-               tStart = time(NULL);
-
                calcSize = rcuda::CalcChainsOnCUDA(&cuTask, calcBuff);
-               tEnd = time(NULL);
-               std::cerr << "CUDA time taken: " << tEnd - tStart << std::endl;
-               tStart = time(NULL);
+
                if(calcSize > 0) {
                        nCurrentCalculatedChains += calcSize;
                        for(ii = 0; ii < cuTask.idxCount; ii++) {
                                nIndex[0] = cuTask.startIdx + ii;
-                               nReturn = fwrite(nIndex, 1, 8, outfile);
-                               nReturn += fwrite(calcBuff+(2*ii), 1, 8, outfile);
+//                             nReturn = fwrite(nIndex, 1, 8, outfile);
+                               nReturn = fwrite(calcBuff+(2*ii), 1, 8, outfile);
                                nReturn += fwrite(calcBuff+(2*ii+1), 1, 2, outfile);
-                               if(nReturn != 18) {
+                               if(nReturn != 10) {
                                        std::cerr << "disk write fail" << std::endl;
                                        fclose(outfile);
                                        return 9;
@@ -302,12 +309,11 @@ time_t tEnd;
                        return 0;
                }
        }
-       tEndFinal = time(NULL);
-       std::cerr << "Time taken: " << tEndFinal - tStartFinal << " secs" << std::endl;
        delete [] calcBuff;
 #ifdef _DEBUG
        std::cout << "Generation completed" << std::endl;
 #endif
+/*
     fseek(outfile, 0, SEEK_SET);
        nFileLen = GetFileLen(outfile);
        nRainbowChainCount = nFileLen / 18;
@@ -347,7 +353,7 @@ time_t tEnd;
                }
                delete[] pChain;
        }
-
+*/
        fclose(outfile);
     
        // main loop - read characters, convert to UC, write
@@ -368,6 +374,3 @@ int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode
     return main(argc, argv);
 }
 #endif
-
-const char *BOINC_RCSID_33ac47a071 = "$Id: upper_case.C 12135 2007-02-21 20:04:14Z davea $";
-
index 187b10586ad637b9d67b7d79f1199548669d1a9a..b626041ede05721baf29f545226e9edcb5c9a524 100644 (file)
+//============================================================================
+// Name        : rcuda.cu
+// Author      : Jan Kyska
+// Version     : 1.00
+// Description : Generator of FreeRainbowTables / MD5, MD4, NTLM, SHA1, LM
+//============================================================================ 
+
 #include <stdio.h>  
 #include <cuda.h>  
 #include "rcuda.h"
 
 #define   GRID_X_L2     6
 #define   GRID_Y_L2     6
-#define   BLOCK_X_L2    7
+#define   BLOCK_X_L2    6
 #define   GRID_X_SIZE   (1<<GRID_X_L2)
 #define   GRID_Y_SIZE   (1<<GRID_Y_L2)
 #define   BLOCK_X_SIZE  (1<<BLOCK_X_L2)
+#define   PLAIN_MAX_SIZE     20
 #define   KERN_CHAIN_SIZE   100
+#define   CHAR_SET_MAXLEN   512
 #define   SHIDX(x)      ((x)<<4)
 
 
-__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 };
-__device__ __constant__ unsigned char r[64] = { \
-                                                       7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22, \
-                                                       5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20, \
-                                                       4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23, \
-                                                       6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21 };
-__device__ __constant__ unsigned char g[64] = { \
-                                                       0, 1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15, \
-                                                       1, 6, 11,  0,   5, 10, 15,  4,   9, 14,  3,  8,  13,  2,  7, 12, \
-                                                       5, 8, 11, 14,   1,  4,  7, 10,  13,  0,  3,  6,   9, 12, 15,  2, \
-                                                       0, 7, 14,  5,  12,  3, 10,  1,   8, 15,  6, 13,   4, 11,  2,  9 };
-__device__ __constant__ unsigned int ac[64] = { \
-                                                       0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, \
-                                                       0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, \
-                                                       0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, \
-                                                       0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, \
-                                                       \
-                                                       0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, \
-                                                       0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, \
-                                                       0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, \
-                                                       0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, \
-                                                       \
-                                                       0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, \
-                                                       0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, \
-                                                       0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, \
-                                                       0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, \
-                                                       \
-                                                       0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, \
-                                                       0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, \
-                                                       0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, \
-                                                       0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };
-
-__device__ __constant__ unsigned int testData[16] = { \
-                                                       0x79706d63, 0x6d627667, 0x00000080, 0x00000000, \
-                                                       0x00000000, 0x00000000, 0x00000000, 0x00000000, \
-                                                       0x00000000, 0x00000000, 0x00000000, 0x00000000, \
-                                                       0x00000000, 0x00000000, 0x00000040, 0x00000000 };
-
-__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
-       unsigned int ret;
-       ret = a + ((b&c)|((~b)&d)) + ac[i] + data[SHIDX(g[i])];
-       ret = (ret<<r[i])|(ret>>(32-r[i]));
-       ret += b;
-       return ret;
-}
-
-__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
-       unsigned int ret;
-       ret = a + ((b&d)|(c&(~d))) + ac[i] + data[SHIDX(g[i])];
-       ret = (ret<<r[i])|(ret>>(32-r[i]));
-       ret += b;
-       return ret;
-}
-
-__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
-       unsigned int ret;
-       ret = a + (b^c^d) + ac[i] + data[SHIDX(g[i])];
-       ret = (ret<<r[i])|(ret>>(32-r[i]));
-       ret += b;
-       return ret;
-}
-
-__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {
-       unsigned int ret;
-       ret = a + (c^(b|(~d))) + ac[i] + data[SHIDX(g[i])];
-       ret = (ret<<r[i])|(ret>>(32-r[i]));
-       ret += b;
-       return ret;
-}
-
-__device__ void MD5(unsigned int* dataHash) {
-       unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x;
-       int ii;
-
-       // Round 1
-       for(ii = 0; ii < 16; ii++) {
-               x = b;
-               b = FF(a, b, c, d, ii, dataHash);
-               a = d; d = c; c = x;
-       }
-
-       // Round 2
-       for(; ii < 32; ii++) {
-               x = b;
-               b = GG(a, b, c, d, ii, dataHash);
-               a = d; d = c; c = x;
-       }
-       
-       // Round 3
-       for(; ii < 48; ii++) {
-               x = b;
-               b = HH(a, b, c, d, ii, dataHash);
-               a = d; d = c; c = x;
-       }
-       
-       // Round 4
-       for(; ii < 64; ii++) {
-               x = b;
-               b = II(a, b, c, d, ii, dataHash);
-               a = d; d = c; c = x;
-       }
-
-       dataHash[SHIDX(0)] = a + h[0];
-       dataHash[SHIDX(1)] = b + h[1];
-       dataHash[SHIDX(2)] = c + h[2];
-       dataHash[SHIDX(3)] = d + h[3];
-}
-
-
-#define   PLAIN_MAX_SIZE     20
-
 __device__ uint64 *dataHeap;
 __device__ unsigned char *plStart;
 __device__ uint3 *plDimVec;
@@ -133,123 +31,78 @@ __device__ uint64 reduceOffset;
 __device__ uint64 plainSpaceTotal;
 __device__ uint64 rPlainSpaceTotal;
 
-
-__global__ void RTGenMD5Kernel(unsigned int chainStart, unsigned int chainStop) {
-       unsigned int *hData;
-       uint3 dimItem;
-       unsigned int uiVal, uiMul, uiDiv, idx;
-       uint64 uiVal64, uiMul64, uiDiv64, idx64;
-       unsigned int nPos, size, ii, jj, kk;
-       unsigned int cpcheck, checkpoint;
-       unsigned int plain;
-
-       __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];
-
-       if(threadIdx.x == 0) {
-               nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;
-               for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {
-                       hData = shData + ((ii>>4)<<8)+(ii&15);
-                       hData[SHIDX(0)] = dataHeap[nPos];
-                       hData[SHIDX(1)] = dataHeap[nPos]>>32;
-                       hData[SHIDX(2)] = dataHeap[nPos+1];
-               }
-       }
-       __syncthreads();
-
-       hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15);
-       
-       idx64 = hData[SHIDX(1)];
-       idx64 = (idx64<<32) | hData[SHIDX(0)];
-       cpcheck = hData[SHIDX(2)];
-       checkpoint = cpcheck&0x0000ffff;
-       cpcheck = cpcheck>>16;
-
+#define RTGEN_PROLOGUE  \
+       unsigned int *hData;  \
+       unsigned int idx;  \
+       uint64 idx64;  \
+       unsigned int nPos, ii;  \
+       unsigned int cpcheck, checkpoint;  \
+       \
+       __shared__ unsigned int shData[SHIDX(BLOCK_X_SIZE)];  \
+       __shared__ unsigned char cplChrSet[CHAR_SET_MAXLEN];  \
+       __shared__ unsigned char cplStart[PLAIN_MAX_SIZE];  \
+       __shared__ uint3 cplDimVec[PLAIN_MAX_SIZE];  \
+       \
+       if(threadIdx.x == 0) {  \
+               nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;  \
+               for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
+                       hData = shData + ((ii>>4)<<8)+(ii&15);  \
+                       hData[SHIDX(0)] = dataHeap[nPos];  \
+                       hData[SHIDX(1)] = dataHeap[nPos]>>32;  \
+                       hData[SHIDX(2)] = dataHeap[nPos+1];  \
+               }  \
+               memcpy(cplChrSet, plChrSet, CHAR_SET_MAXLEN);  \
+               memcpy(cplStart, plStart, PLAIN_MAX_SIZE);  \
+               memcpy(cplDimVec, plDimVec, PLAIN_MAX_SIZE*sizeof(uint3));  \
+       }  \
+       __syncthreads();  \
+       \
+       hData = shData + ((threadIdx.x>>4)<<8)+(threadIdx.x&15);  \
+       \
+       idx64 = hData[SHIDX(1)];  \
+       idx64 = (idx64<<32) | hData[SHIDX(0)];  \
+       cpcheck = hData[SHIDX(2)];  \
+       checkpoint = cpcheck&0x0000ffff;  \
+       cpcheck = cpcheck>>16;  \
+       \
        for(nPos = chainStart; nPos < chainStop; nPos++) {
-               // transform to the plain text
-               plain = 0x80;
-               jj = (PLAIN_MAX_SIZE>>2)+1;
-               for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) {
-                       uiVal64 = idx64 + plStart[ii];
-                       uiVal64--;
-                       dimItem = plDimVec[ii];
-                       uiMul64 = (uint64)dimItem.y<<32;
-                       idx64 = __umul64hi(uiVal64, uiMul64);
-                       uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x;
-                       uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);
-                       uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;
-                       idx64 += uiVal;
-                       if(uiDiv >= dimItem.x) {
-                               uiDiv -= dimItem.x;
-                               idx64++;
-                       }
-                       plain = (plain<<8) | plChrSet[dimItem.z + uiDiv];
-                       if((ii&3) == 2) {
-                               hData[SHIDX(jj--)] = plain;
-                               plain = 0;
-                       }
-               }
-               for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) {
-                       uiVal = idx + plStart[ii];
-                       uiVal--;
-                       dimItem = plDimVec[ii];
-                       idx = __umulhi(uiVal, dimItem.y);
-                       uiDiv = uiVal - idx*dimItem.x;
-                       if(uiDiv >= dimItem.x) {
-                               uiDiv -= dimItem.x;
-                               idx++;
-                       }
-                       plain = (plain<<8) | plChrSet[dimItem.z + uiDiv];
-                       if((ii&3) == 2) {
-                               hData[SHIDX(jj--)] = plain;
-                               plain = 0;
-                       }
-               }
-
-               // prepare for MD5
-               size = ii;
-               ii = ((((3-(ii&3))<<3)-1)&0x1f)+1;
-               plain = plain<<ii;
-               for(jj++, kk = 0; jj <= (PLAIN_MAX_SIZE>>2)+1; plain = hData[SHIDX(jj++)], kk++)
-                       hData[SHIDX(kk)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii));
-               hData[SHIDX(kk)] = plain>>ii;
-               for(kk++; kk < 14; kk++)
-                       hData[SHIDX(kk)] = 0;
-               hData[SHIDX(kk++)] = size<<3;
-               hData[SHIDX(kk)] = 0;
 
-               // hash
-               MD5(hData);
 
-               idx64 = hData[SHIDX(1)];
-               idx64 = (idx64<<32) | hData[SHIDX(0)];
-               idx64 += reduceOffset + nPos;
-               uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);
-               idx64 -= uiDiv64*plainSpaceTotal;
-               if(idx64 >= plainSpaceTotal)
-                       idx64 -= plainSpaceTotal;
-                       
-               if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) {
-                       checkpoint |= ((unsigned int)idx64&1) << cpcheck;
-                       cpcheck++;
-               }
+#define RTGEN_EPILOGUE  \
+               idx64 = hData[SHIDX(1)];  \
+               idx64 = (idx64<<32) | hData[SHIDX(0)];  \
+               idx64 += reduceOffset + nPos;  \
+               uiDiv64 = __umul64hi(idx64, rPlainSpaceTotal);  \
+               idx64 -= uiDiv64*plainSpaceTotal;  \
+               if(idx64 >= plainSpaceTotal)  \
+                       idx64 -= plainSpaceTotal;  \
+               \
+               if(cpcheck < plCpPosSize && nPos == plCpPos[cpcheck]) {  \
+                       checkpoint |= ((unsigned int)idx64&1) << cpcheck;  \
+                       cpcheck++;  \
+               }  \
+       }  \
+       \
+       hData[SHIDX(0)] = idx64;  \
+       hData[SHIDX(1)] = idx64>>32;  \
+       hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff);  \
+       __syncthreads();  \
+       \
+       if(threadIdx.x == 0) {  \
+               nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;  \
+               for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {  \
+                       hData = shData + ((ii>>4)<<8)+(ii&15);  \
+                       dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)];  \
+                       dataHeap[nPos+1] = hData[SHIDX(2)];  \
+               }  \
        }
 
-       hData[SHIDX(0)] = idx64;
-       hData[SHIDX(1)] = idx64>>32;
-       hData[SHIDX(2)] = (cpcheck<<16)|(checkpoint&0x0000ffff);
-       __syncthreads();
-       
-       if(threadIdx.x == 0) {
-               nPos = ((((blockIdx.y<<GRID_X_L2) + blockIdx.x)<<BLOCK_X_L2) + threadIdx.x) << 1;
-               for(ii = 0; ii < BLOCK_X_SIZE; ii++, nPos+=2) {
-                       hData = shData + ((ii>>4)<<8)+(ii&15);
-                       dataHeap[nPos] = ((uint64)hData[SHIDX(1)]<<32)|(uint64)hData[SHIDX(0)];
-                       dataHeap[nPos+1] = hData[SHIDX(2)];
-               }
-       }
-       __syncthreads();
-}
 
+#include "rcuda_md5.inc"
+#include "rcuda_md4.inc"
+#include "rcuda_ntlm.inc"
+#include "rcuda_sha1.inc"
+#include "rcuda_lm.inc"
 
 extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff) {
        cudaError_t cuErr;
@@ -260,22 +113,33 @@ extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff
        unsigned char *charSet;
        int *cpPos;
        uint64 uiVal64;
-       time_t tStart, tEnd;
-       if(task->hash != rcuda::RHASH_MD5)
-               return 0;
 
+       if(task->charSetSize > CHAR_SET_MAXLEN)
+               return -1;
+
+       switch(task->hash) {
+       case rcuda::RHASH_MD5:
+       case rcuda::RHASH_MD4:
+       case rcuda::RHASH_NTLM:
+       case rcuda::RHASH_SHA1:
+       case rcuda::RHASH_LM:
+               break;
+       default:        
+               return 0;
+       }
+       
        memset(buff, 0, PLAIN_MAX_SIZE);
        cudaMalloc((void**)&data, task->idxCount*2*sizeof(uint64));
        cudaMalloc((void**)&stPlain, PLAIN_MAX_SIZE);
-       cudaMalloc((void**)&dimVec, task->dimVecSize*sizeof(uint3));
-       cudaMalloc((void**)&charSet, task->charSetSize);
+       cudaMalloc((void**)&dimVec, max(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3));
+       cudaMalloc((void**)&charSet, CHAR_SET_MAXLEN);
        cudaMalloc((void**)&cpPos, task->cpPosSize*sizeof(int));
 
        cudaMemcpy(data, resultBuff, task->idxCount*2*sizeof(uint64), cudaMemcpyHostToDevice);
        cudaMemcpy(stPlain, buff, PLAIN_MAX_SIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(stPlain, task->stPlain, min(task->stPlainSize, PLAIN_MAX_SIZE), cudaMemcpyHostToDevice);
-       cudaMemcpy(dimVec, task->dimVec, task->dimVecSize*sizeof(uint3), cudaMemcpyHostToDevice);
-       cudaMemcpy(charSet, task->charSet, task->charSetSize, cudaMemcpyHostToDevice);
+       cudaMemcpy(dimVec, task->dimVec, min(task->dimVecSize, PLAIN_MAX_SIZE)*sizeof(uint3), cudaMemcpyHostToDevice);
+       cudaMemcpy(charSet, task->charSet, min(task->charSetSize, CHAR_SET_MAXLEN), cudaMemcpyHostToDevice);
        cudaMemcpy(cpPos, task->cpPositions, task->cpPosSize*sizeof(int), cudaMemcpyHostToDevice);
 
        cudaMemcpyToSymbol(dataHeap, &data, sizeof(data));
@@ -294,16 +158,30 @@ extern "C" int CalcChainsOnCUDA(const rcuda::RCudaTask* task, uint64 *resultBuff
        grSizeX = GRID_X_SIZE;
        dim3 numBlocks(grSizeX, grSizeY);
        cuErr = cudaSuccess;
-       tStart = time(NULL);
+       
+       cudaSetDeviceFlags(cudaDeviceBlockingSync);
        for(int idx = 0; idx < task->rainbowChainLen-1 && cuErr == cudaSuccess; idx+=KERN_CHAIN_SIZE) {
-               RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+               switch(task->hash) {
+               case rcuda::RHASH_MD5:
+                       RTGenMD5Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+                       break;
+               case rcuda::RHASH_MD4:
+                       RTGenMD4Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+                       break;
+               case rcuda::RHASH_NTLM:
+                       RTGenNTLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+                       break;
+               case rcuda::RHASH_SHA1:
+                       RTGenSHA1Kernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+                       break;
+               case rcuda::RHASH_LM:
+                       RTGenLMKernel<<<numBlocks, BLOCK_X_SIZE>>>(idx, min(idx+KERN_CHAIN_SIZE, task->rainbowChainLen-1));
+                       break;
+               }
                cuErr = cudaGetLastError();
                if(cuErr == cudaSuccess)
                        cuErr = cudaThreadSynchronize();
-               
        }
-       tEnd = time(NULL);
-       fprintf(stderr, "Kernel run time: %i\n", (tEnd - tStart));
 
        if(cuErr == cudaSuccess)
                cudaMemcpy(resultBuff, data, task->idxCount*2*sizeof(uint64), cudaMemcpyDeviceToHost);
@@ -324,3 +202,7 @@ extern "C" int GetChainsBufferSize(int minSize) {
        grSizeX = GRID_X_SIZE;
        return grSizeX*grSizeY*BLOCK_X_SIZE;
 }
+
+extern "C" int SetCudaDevice(int device) {
+       return cudaSetDevice(device);
+}
index 370b0646653f895cd6fec06cc796cd66acd0ee04..c641eda533fb735c5cca51378e1d97c0afc87c56 100644 (file)
@@ -24,8 +24,9 @@ struct RCudaTask {
        unsigned int rainbowChainLen;
 };
 
-extern "C" int CalcChainsOnCUDA(const RCudaTask* task, uint64 *resultBuff);
+extern "C" int SetCudaDevice(int device);
 extern "C" int GetChainsBufferSize(int minSize);
+extern "C" int CalcChainsOnCUDA(const RCudaTask* task, uint64 *resultBuff);
 
 }
 
index 74b8d9e37ad4590b1e8d8fa9748cf99ac95529dd..ea78a0d905eea32ce361e166272ecc313e6c3abf 100644 (file)
@@ -1,7 +1,7 @@
 //============================================================================
 // Name        : rcuda_ext.cpp
 // Author      : Jan Kyska
-// Version     : 0.9
+// Version     : 1.00
 // Description : A code to access internals of the CChainWalkContext 
 //               for the CUDA generator of FreeRainbowTables
 //============================================================================ 
@@ -76,7 +76,8 @@ int CudaCWCExtender::IndexToStartPlain(const uint64 nIndex, std::vector<unsigned
                        stCharset &chs = CChainWalkContext::m_vCharset[jj];
                        nCharsetLen += chs.m_nPlainLenMax;
                        if(ii < nCharsetLen) { // We found the correct charset
-                               stPlain.push_back(nIndexOfX % chs.m_nPlainCharsetLen + 1);
+                               //XXX from md5 only cuda stPlain.push_back(nIndexOfX % chs.m_nPlainCharsetLen + 1);
+                               stPlain.push_back((unsigned char)(nIndexOfX % (uint64)chs.m_nPlainCharsetLen + 1));
                                nIndexOfX /= chs.m_nPlainCharsetLen;
                        }
                }
diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_lm.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_lm.inc
new file mode 100644 (file)
index 0000000..d1fbc2b
--- /dev/null
@@ -0,0 +1,479 @@
+//============================================================================\r
+// Name        : rcuda_lm.inc\r
+// Author      : Jan Kyska\r
+// Version     : 1.00\r
+// Description : LM hash kernel for Generator of FreeRainbowTables\r
+//============================================================================ \r
+\r
+namespace RC_LM {\r
+\r
+#define  ITERATIONS    16\r
+\r
+\r
+__device__ __constant__ int shifts2[16] = { 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0 };\r
+\r
+__device__ __constant__ unsigned int des_skb[8][64] = {\r
+{\r
+/* for C bits (numbered as per FIPS 46) 1 2 3 4 5 6 */\r
+0x00000000L,0x00000010L,0x20000000L,0x20000010L,\r
+0x00010000L,0x00010010L,0x20010000L,0x20010010L,\r
+0x00000800L,0x00000810L,0x20000800L,0x20000810L,\r
+0x00010800L,0x00010810L,0x20010800L,0x20010810L,\r
+0x00000020L,0x00000030L,0x20000020L,0x20000030L,\r
+0x00010020L,0x00010030L,0x20010020L,0x20010030L,\r
+0x00000820L,0x00000830L,0x20000820L,0x20000830L,\r
+0x00010820L,0x00010830L,0x20010820L,0x20010830L,\r
+0x00080000L,0x00080010L,0x20080000L,0x20080010L,\r
+0x00090000L,0x00090010L,0x20090000L,0x20090010L,\r
+0x00080800L,0x00080810L,0x20080800L,0x20080810L,\r
+0x00090800L,0x00090810L,0x20090800L,0x20090810L,\r
+0x00080020L,0x00080030L,0x20080020L,0x20080030L,\r
+0x00090020L,0x00090030L,0x20090020L,0x20090030L,\r
+0x00080820L,0x00080830L,0x20080820L,0x20080830L,\r
+0x00090820L,0x00090830L,0x20090820L,0x20090830L,\r
+},{\r
+/* for C bits (numbered as per FIPS 46) 7 8 10 11 12 13 */\r
+0x00000000L,0x02000000L,0x00002000L,0x02002000L,\r
+0x00200000L,0x02200000L,0x00202000L,0x02202000L,\r
+0x00000004L,0x02000004L,0x00002004L,0x02002004L,\r
+0x00200004L,0x02200004L,0x00202004L,0x02202004L,\r
+0x00000400L,0x02000400L,0x00002400L,0x02002400L,\r
+0x00200400L,0x02200400L,0x00202400L,0x02202400L,\r
+0x00000404L,0x02000404L,0x00002404L,0x02002404L,\r
+0x00200404L,0x02200404L,0x00202404L,0x02202404L,\r
+0x10000000L,0x12000000L,0x10002000L,0x12002000L,\r
+0x10200000L,0x12200000L,0x10202000L,0x12202000L,\r
+0x10000004L,0x12000004L,0x10002004L,0x12002004L,\r
+0x10200004L,0x12200004L,0x10202004L,0x12202004L,\r
+0x10000400L,0x12000400L,0x10002400L,0x12002400L,\r
+0x10200400L,0x12200400L,0x10202400L,0x12202400L,\r
+0x10000404L,0x12000404L,0x10002404L,0x12002404L,\r
+0x10200404L,0x12200404L,0x10202404L,0x12202404L,\r
+},{\r
+/* for C bits (numbered as per FIPS 46) 14 15 16 17 19 20 */\r
+0x00000000L,0x00000001L,0x00040000L,0x00040001L,\r
+0x01000000L,0x01000001L,0x01040000L,0x01040001L,\r
+0x00000002L,0x00000003L,0x00040002L,0x00040003L,\r
+0x01000002L,0x01000003L,0x01040002L,0x01040003L,\r
+0x00000200L,0x00000201L,0x00040200L,0x00040201L,\r
+0x01000200L,0x01000201L,0x01040200L,0x01040201L,\r
+0x00000202L,0x00000203L,0x00040202L,0x00040203L,\r
+0x01000202L,0x01000203L,0x01040202L,0x01040203L,\r
+0x08000000L,0x08000001L,0x08040000L,0x08040001L,\r
+0x09000000L,0x09000001L,0x09040000L,0x09040001L,\r
+0x08000002L,0x08000003L,0x08040002L,0x08040003L,\r
+0x09000002L,0x09000003L,0x09040002L,0x09040003L,\r
+0x08000200L,0x08000201L,0x08040200L,0x08040201L,\r
+0x09000200L,0x09000201L,0x09040200L,0x09040201L,\r
+0x08000202L,0x08000203L,0x08040202L,0x08040203L,\r
+0x09000202L,0x09000203L,0x09040202L,0x09040203L,\r
+},{\r
+/* for C bits (numbered as per FIPS 46) 21 23 24 26 27 28 */\r
+0x00000000L,0x00100000L,0x00000100L,0x00100100L,\r
+0x00000008L,0x00100008L,0x00000108L,0x00100108L,\r
+0x00001000L,0x00101000L,0x00001100L,0x00101100L,\r
+0x00001008L,0x00101008L,0x00001108L,0x00101108L,\r
+0x04000000L,0x04100000L,0x04000100L,0x04100100L,\r
+0x04000008L,0x04100008L,0x04000108L,0x04100108L,\r
+0x04001000L,0x04101000L,0x04001100L,0x04101100L,\r
+0x04001008L,0x04101008L,0x04001108L,0x04101108L,\r
+0x00020000L,0x00120000L,0x00020100L,0x00120100L,\r
+0x00020008L,0x00120008L,0x00020108L,0x00120108L,\r
+0x00021000L,0x00121000L,0x00021100L,0x00121100L,\r
+0x00021008L,0x00121008L,0x00021108L,0x00121108L,\r
+0x04020000L,0x04120000L,0x04020100L,0x04120100L,\r
+0x04020008L,0x04120008L,0x04020108L,0x04120108L,\r
+0x04021000L,0x04121000L,0x04021100L,0x04121100L,\r
+0x04021008L,0x04121008L,0x04021108L,0x04121108L,\r
+},{\r
+/* for D bits (numbered as per FIPS 46) 1 2 3 4 5 6 */\r
+0x00000000L,0x10000000L,0x00010000L,0x10010000L,\r
+0x00000004L,0x10000004L,0x00010004L,0x10010004L,\r
+0x20000000L,0x30000000L,0x20010000L,0x30010000L,\r
+0x20000004L,0x30000004L,0x20010004L,0x30010004L,\r
+0x00100000L,0x10100000L,0x00110000L,0x10110000L,\r
+0x00100004L,0x10100004L,0x00110004L,0x10110004L,\r
+0x20100000L,0x30100000L,0x20110000L,0x30110000L,\r
+0x20100004L,0x30100004L,0x20110004L,0x30110004L,\r
+0x00001000L,0x10001000L,0x00011000L,0x10011000L,\r
+0x00001004L,0x10001004L,0x00011004L,0x10011004L,\r
+0x20001000L,0x30001000L,0x20011000L,0x30011000L,\r
+0x20001004L,0x30001004L,0x20011004L,0x30011004L,\r
+0x00101000L,0x10101000L,0x00111000L,0x10111000L,\r
+0x00101004L,0x10101004L,0x00111004L,0x10111004L,\r
+0x20101000L,0x30101000L,0x20111000L,0x30111000L,\r
+0x20101004L,0x30101004L,0x20111004L,0x30111004L,\r
+},{\r
+/* for D bits (numbered as per FIPS 46) 8 9 11 12 13 14 */\r
+0x00000000L,0x08000000L,0x00000008L,0x08000008L,\r
+0x00000400L,0x08000400L,0x00000408L,0x08000408L,\r
+0x00020000L,0x08020000L,0x00020008L,0x08020008L,\r
+0x00020400L,0x08020400L,0x00020408L,0x08020408L,\r
+0x00000001L,0x08000001L,0x00000009L,0x08000009L,\r
+0x00000401L,0x08000401L,0x00000409L,0x08000409L,\r
+0x00020001L,0x08020001L,0x00020009L,0x08020009L,\r
+0x00020401L,0x08020401L,0x00020409L,0x08020409L,\r
+0x02000000L,0x0A000000L,0x02000008L,0x0A000008L,\r
+0x02000400L,0x0A000400L,0x02000408L,0x0A000408L,\r
+0x02020000L,0x0A020000L,0x02020008L,0x0A020008L,\r
+0x02020400L,0x0A020400L,0x02020408L,0x0A020408L,\r
+0x02000001L,0x0A000001L,0x02000009L,0x0A000009L,\r
+0x02000401L,0x0A000401L,0x02000409L,0x0A000409L,\r
+0x02020001L,0x0A020001L,0x02020009L,0x0A020009L,\r
+0x02020401L,0x0A020401L,0x02020409L,0x0A020409L,\r
+},{\r
+/* for D bits (numbered as per FIPS 46) 16 17 18 19 20 21 */\r
+0x00000000L,0x00000100L,0x00080000L,0x00080100L,\r
+0x01000000L,0x01000100L,0x01080000L,0x01080100L,\r
+0x00000010L,0x00000110L,0x00080010L,0x00080110L,\r
+0x01000010L,0x01000110L,0x01080010L,0x01080110L,\r
+0x00200000L,0x00200100L,0x00280000L,0x00280100L,\r
+0x01200000L,0x01200100L,0x01280000L,0x01280100L,\r
+0x00200010L,0x00200110L,0x00280010L,0x00280110L,\r
+0x01200010L,0x01200110L,0x01280010L,0x01280110L,\r
+0x00000200L,0x00000300L,0x00080200L,0x00080300L,\r
+0x01000200L,0x01000300L,0x01080200L,0x01080300L,\r
+0x00000210L,0x00000310L,0x00080210L,0x00080310L,\r
+0x01000210L,0x01000310L,0x01080210L,0x01080310L,\r
+0x00200200L,0x00200300L,0x00280200L,0x00280300L,\r
+0x01200200L,0x01200300L,0x01280200L,0x01280300L,\r
+0x00200210L,0x00200310L,0x00280210L,0x00280310L,\r
+0x01200210L,0x01200310L,0x01280210L,0x01280310L,\r
+},{\r
+/* for D bits (numbered as per FIPS 46) 22 23 24 25 27 28 */\r
+0x00000000L,0x04000000L,0x00040000L,0x04040000L,\r
+0x00000002L,0x04000002L,0x00040002L,0x04040002L,\r
+0x00002000L,0x04002000L,0x00042000L,0x04042000L,\r
+0x00002002L,0x04002002L,0x00042002L,0x04042002L,\r
+0x00000020L,0x04000020L,0x00040020L,0x04040020L,\r
+0x00000022L,0x04000022L,0x00040022L,0x04040022L,\r
+0x00002020L,0x04002020L,0x00042020L,0x04042020L,\r
+0x00002022L,0x04002022L,0x00042022L,0x04042022L,\r
+0x00000800L,0x04000800L,0x00040800L,0x04040800L,\r
+0x00000802L,0x04000802L,0x00040802L,0x04040802L,\r
+0x00002800L,0x04002800L,0x00042800L,0x04042800L,\r
+0x00002802L,0x04002802L,0x00042802L,0x04042802L,\r
+0x00000820L,0x04000820L,0x00040820L,0x04040820L,\r
+0x00000822L,0x04000822L,0x00040822L,0x04040822L,\r
+0x00002820L,0x04002820L,0x00042820L,0x04042820L,\r
+0x00002822L,0x04002822L,0x00042822L,0x04042822L,\r
+}};\r
+\r
+__device__ __constant__ unsigned int des_SPtrans[8][64] = {\r
+{\r
+/* nibble 0 */\r
+0x02080800L, 0x00080000L, 0x02000002L, 0x02080802L,\r
+0x02000000L, 0x00080802L, 0x00080002L, 0x02000002L,\r
+0x00080802L, 0x02080800L, 0x02080000L, 0x00000802L,\r
+0x02000802L, 0x02000000L, 0x00000000L, 0x00080002L,\r
+0x00080000L, 0x00000002L, 0x02000800L, 0x00080800L,\r
+0x02080802L, 0x02080000L, 0x00000802L, 0x02000800L,\r
+0x00000002L, 0x00000800L, 0x00080800L, 0x02080002L,\r
+0x00000800L, 0x02000802L, 0x02080002L, 0x00000000L,\r
+0x00000000L, 0x02080802L, 0x02000800L, 0x00080002L,\r
+0x02080800L, 0x00080000L, 0x00000802L, 0x02000800L,\r
+0x02080002L, 0x00000800L, 0x00080800L, 0x02000002L,\r
+0x00080802L, 0x00000002L, 0x02000002L, 0x02080000L,\r
+0x02080802L, 0x00080800L, 0x02080000L, 0x02000802L,\r
+0x02000000L, 0x00000802L, 0x00080002L, 0x00000000L,\r
+0x00080000L, 0x02000000L, 0x02000802L, 0x02080800L,\r
+0x00000002L, 0x02080002L, 0x00000800L, 0x00080802L,\r
+},{\r
+/* nibble 1 */\r
+0x40108010L, 0x00000000L, 0x00108000L, 0x40100000L,\r
+0x40000010L, 0x00008010L, 0x40008000L, 0x00108000L,\r
+0x00008000L, 0x40100010L, 0x00000010L, 0x40008000L,\r
+0x00100010L, 0x40108000L, 0x40100000L, 0x00000010L,\r
+0x00100000L, 0x40008010L, 0x40100010L, 0x00008000L,\r
+0x00108010L, 0x40000000L, 0x00000000L, 0x00100010L,\r
+0x40008010L, 0x00108010L, 0x40108000L, 0x40000010L,\r
+0x40000000L, 0x00100000L, 0x00008010L, 0x40108010L,\r
+0x00100010L, 0x40108000L, 0x40008000L, 0x00108010L,\r
+0x40108010L, 0x00100010L, 0x40000010L, 0x00000000L,\r
+0x40000000L, 0x00008010L, 0x00100000L, 0x40100010L,\r
+0x00008000L, 0x40000000L, 0x00108010L, 0x40008010L,\r
+0x40108000L, 0x00008000L, 0x00000000L, 0x40000010L,\r
+0x00000010L, 0x40108010L, 0x00108000L, 0x40100000L,\r
+0x40100010L, 0x00100000L, 0x00008010L, 0x40008000L,\r
+0x40008010L, 0x00000010L, 0x40100000L, 0x00108000L,\r
+},{\r
+/* nibble 2 */\r
+0x04000001L, 0x04040100L, 0x00000100L, 0x04000101L,\r
+0x00040001L, 0x04000000L, 0x04000101L, 0x00040100L,\r
+0x04000100L, 0x00040000L, 0x04040000L, 0x00000001L,\r
+0x04040101L, 0x00000101L, 0x00000001L, 0x04040001L,\r
+0x00000000L, 0x00040001L, 0x04040100L, 0x00000100L,\r
+0x00000101L, 0x04040101L, 0x00040000L, 0x04000001L,\r
+0x04040001L, 0x04000100L, 0x00040101L, 0x04040000L,\r
+0x00040100L, 0x00000000L, 0x04000000L, 0x00040101L,\r
+0x04040100L, 0x00000100L, 0x00000001L, 0x00040000L,\r
+0x00000101L, 0x00040001L, 0x04040000L, 0x04000101L,\r
+0x00000000L, 0x04040100L, 0x00040100L, 0x04040001L,\r
+0x00040001L, 0x04000000L, 0x04040101L, 0x00000001L,\r
+0x00040101L, 0x04000001L, 0x04000000L, 0x04040101L,\r
+0x00040000L, 0x04000100L, 0x04000101L, 0x00040100L,\r
+0x04000100L, 0x00000000L, 0x04040001L, 0x00000101L,\r
+0x04000001L, 0x00040101L, 0x00000100L, 0x04040000L,\r
+},{\r
+/* nibble 3 */\r
+0x00401008L, 0x10001000L, 0x00000008L, 0x10401008L,\r
+0x00000000L, 0x10400000L, 0x10001008L, 0x00400008L,\r
+0x10401000L, 0x10000008L, 0x10000000L, 0x00001008L,\r
+0x10000008L, 0x00401008L, 0x00400000L, 0x10000000L,\r
+0x10400008L, 0x00401000L, 0x00001000L, 0x00000008L,\r
+0x00401000L, 0x10001008L, 0x10400000L, 0x00001000L,\r
+0x00001008L, 0x00000000L, 0x00400008L, 0x10401000L,\r
+0x10001000L, 0x10400008L, 0x10401008L, 0x00400000L,\r
+0x10400008L, 0x00001008L, 0x00400000L, 0x10000008L,\r
+0x00401000L, 0x10001000L, 0x00000008L, 0x10400000L,\r
+0x10001008L, 0x00000000L, 0x00001000L, 0x00400008L,\r
+0x00000000L, 0x10400008L, 0x10401000L, 0x00001000L,\r
+0x10000000L, 0x10401008L, 0x00401008L, 0x00400000L,\r
+0x10401008L, 0x00000008L, 0x10001000L, 0x00401008L,\r
+0x00400008L, 0x00401000L, 0x10400000L, 0x10001008L,\r
+0x00001008L, 0x10000000L, 0x10000008L, 0x10401000L,\r
+},{\r
+/* nibble 4 */\r
+0x08000000L, 0x00010000L, 0x00000400L, 0x08010420L,\r
+0x08010020L, 0x08000400L, 0x00010420L, 0x08010000L,\r
+0x00010000L, 0x00000020L, 0x08000020L, 0x00010400L,\r
+0x08000420L, 0x08010020L, 0x08010400L, 0x00000000L,\r
+0x00010400L, 0x08000000L, 0x00010020L, 0x00000420L,\r
+0x08000400L, 0x00010420L, 0x00000000L, 0x08000020L,\r
+0x00000020L, 0x08000420L, 0x08010420L, 0x00010020L,\r
+0x08010000L, 0x00000400L, 0x00000420L, 0x08010400L,\r
+0x08010400L, 0x08000420L, 0x00010020L, 0x08010000L,\r
+0x00010000L, 0x00000020L, 0x08000020L, 0x08000400L,\r
+0x08000000L, 0x00010400L, 0x08010420L, 0x00000000L,\r
+0x00010420L, 0x08000000L, 0x00000400L, 0x00010020L,\r
+0x08000420L, 0x00000400L, 0x00000000L, 0x08010420L,\r
+0x08010020L, 0x08010400L, 0x00000420L, 0x00010000L,\r
+0x00010400L, 0x08010020L, 0x08000400L, 0x00000420L,\r
+0x00000020L, 0x00010420L, 0x08010000L, 0x08000020L,\r
+},{\r
+/* nibble 5 */\r
+0x80000040L, 0x00200040L, 0x00000000L, 0x80202000L,\r
+0x00200040L, 0x00002000L, 0x80002040L, 0x00200000L,\r
+0x00002040L, 0x80202040L, 0x00202000L, 0x80000000L,\r
+0x80002000L, 0x80000040L, 0x80200000L, 0x00202040L,\r
+0x00200000L, 0x80002040L, 0x80200040L, 0x00000000L,\r
+0x00002000L, 0x00000040L, 0x80202000L, 0x80200040L,\r
+0x80202040L, 0x80200000L, 0x80000000L, 0x00002040L,\r
+0x00000040L, 0x00202000L, 0x00202040L, 0x80002000L,\r
+0x00002040L, 0x80000000L, 0x80002000L, 0x00202040L,\r
+0x80202000L, 0x00200040L, 0x00000000L, 0x80002000L,\r
+0x80000000L, 0x00002000L, 0x80200040L, 0x00200000L,\r
+0x00200040L, 0x80202040L, 0x00202000L, 0x00000040L,\r
+0x80202040L, 0x00202000L, 0x00200000L, 0x80002040L,\r
+0x80000040L, 0x80200000L, 0x00202040L, 0x00000000L,\r
+0x00002000L, 0x80000040L, 0x80002040L, 0x80202000L,\r
+0x80200000L, 0x00002040L, 0x00000040L, 0x80200040L,\r
+},{\r
+/* nibble 6 */\r
+0x00004000L, 0x00000200L, 0x01000200L, 0x01000004L,\r
+0x01004204L, 0x00004004L, 0x00004200L, 0x00000000L,\r
+0x01000000L, 0x01000204L, 0x00000204L, 0x01004000L,\r
+0x00000004L, 0x01004200L, 0x01004000L, 0x00000204L,\r
+0x01000204L, 0x00004000L, 0x00004004L, 0x01004204L,\r
+0x00000000L, 0x01000200L, 0x01000004L, 0x00004200L,\r
+0x01004004L, 0x00004204L, 0x01004200L, 0x00000004L,\r
+0x00004204L, 0x01004004L, 0x00000200L, 0x01000000L,\r
+0x00004204L, 0x01004000L, 0x01004004L, 0x00000204L,\r
+0x00004000L, 0x00000200L, 0x01000000L, 0x01004004L,\r
+0x01000204L, 0x00004204L, 0x00004200L, 0x00000000L,\r
+0x00000200L, 0x01000004L, 0x00000004L, 0x01000200L,\r
+0x00000000L, 0x01000204L, 0x01000200L, 0x00004200L,\r
+0x00000204L, 0x00004000L, 0x01004204L, 0x01000000L,\r
+0x01004200L, 0x00000004L, 0x00004004L, 0x01004204L,\r
+0x01000004L, 0x01004200L, 0x01004000L, 0x00004004L,\r
+},{\r
+/* nibble 7 */\r
+0x20800080L, 0x20820000L, 0x00020080L, 0x00000000L,\r
+0x20020000L, 0x00800080L, 0x20800000L, 0x20820080L,\r
+0x00000080L, 0x20000000L, 0x00820000L, 0x00020080L,\r
+0x00820080L, 0x20020080L, 0x20000080L, 0x20800000L,\r
+0x00020000L, 0x00820080L, 0x00800080L, 0x20020000L,\r
+0x20820080L, 0x20000080L, 0x00000000L, 0x00820000L,\r
+0x20000000L, 0x00800000L, 0x20020080L, 0x20800080L,\r
+0x00800000L, 0x00020000L, 0x20820000L, 0x00000080L,\r
+0x00800000L, 0x00020000L, 0x20000080L, 0x20820080L,\r
+0x00020080L, 0x20000000L, 0x00000000L, 0x00820000L,\r
+0x20800080L, 0x20020080L, 0x20020000L, 0x00800080L,\r
+0x20820000L, 0x00000080L, 0x00800080L, 0x20020000L,\r
+0x20820080L, 0x00800000L, 0x20800000L, 0x20000080L,\r
+0x00820000L, 0x00020080L, 0x20020080L, 0x20800000L,\r
+0x00000080L, 0x20820000L, 0x00820080L, 0x00000000L,\r
+0x20000000L, 0x20800080L, 0x00020000L, 0x00820080L,\r
+}};\r
+\r
+\r
+__device__ void PERM_OP(int ia, int ib, int it, unsigned int n, unsigned int m, unsigned int* data) {\r
+       data[it] =((data[ia] >> n ) ^ data[ib]) & m;\r
+       data[ib] ^= data[it];\r
+       data[ia] ^= data[it] << n;\r
+}\r
+\r
+__device__ void HPERM_OP(int ia, int it, int n, unsigned int m, unsigned int* data) {\r
+       data[it] = ((data[ia] << (16-n)) ^ data[ia]) & m;\r
+       data[ia] = data[ia] ^ data[it] ^ (data[it]>>(16-n));\r
+}\r
+\r
+__device__ void IP(int il, int ir, int it, unsigned int* data) {\r
+       PERM_OP(ir, il, it, 4, 0x0f0f0f0f, data);\r
+       PERM_OP(il, ir, it, 16, 0x0000ffff, data);\r
+       PERM_OP(ir, il, it, 2, 0x33333333, data);\r
+       PERM_OP(il, ir, it, 8, 0x00ff00ff, data);\r
+       PERM_OP(ir, il, it, 1, 0x55555555, data);\r
+}\r
+\r
+__device__ void FP(int il, int ir, int it, unsigned int* data) {\r
+       PERM_OP(il, ir, it, 1, 0x55555555, data);\r
+       PERM_OP(ir, il, it, 8, 0x00ff00ff, data);\r
+       PERM_OP(il, ir, it, 2, 0x33333333, data);\r
+       PERM_OP(ir, il, it, 16, 0x0000ffff, data);\r
+       PERM_OP(il, ir, it, 4, 0x0f0f0f0f, data);\r
+}\r
+\r
+__device__ unsigned int D_ENCRYPT(unsigned int ll, unsigned int uu, unsigned int tt) {\r
+       tt = (tt>>4)|(tt<<28);\r
+       return ll ^ des_SPtrans[0][(uu>>2)&0x3f] ^\r
+                       des_SPtrans[2][(uu>>10)&0x3f] ^\r
+                       des_SPtrans[4][(uu>>18)&0x3f] ^\r
+                       des_SPtrans[6][(uu>>26)&0x3f] ^\r
+                       des_SPtrans[1][(tt>>2)&0x3f] ^\r
+                       des_SPtrans[3][(tt>>10)&0x3f] ^\r
+                       des_SPtrans[5][(tt>>18)&0x3f] ^\r
+                       des_SPtrans[7][(tt>>26)&0x3f];\r
+}\r
+\r
+}\r
+\r
+__global__ void RTGenLMKernel(unsigned int chainStart, unsigned int chainStop) {\r
+       uint3 dimItem;\r
+       unsigned int *hData2, *hData3;\r
+       uint64 uiDiv64, uiVal64, uiMul64;\r
+       unsigned int uiVal, uiDiv;\r
+       unsigned int jj, rs, rt;\r
+\r
+       __shared__ unsigned int shData2[SHIDX(BLOCK_X_SIZE)];\r
+       __shared__ unsigned int shData3[BLOCK_X_SIZE<<2];\r
+       hData2 = shData2 + ((threadIdx.x>>4)<<8)+(threadIdx.x&15);\r
+       hData3 = shData3 + ((threadIdx.x>>4)<<6)+(threadIdx.x&15);\r
+\r
+       RTGEN_PROLOGUE;\r
+\r
+       // transform to the plain text\r
+       for(ii = 0; ii < 8; ii++)\r
+               hData[SHIDX(ii)] = 0;\r
+       \r
+       for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal64 = idx64 + cplStart[ii];\r
+               uiVal64--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               uiMul64 = (uint64)dimItem.y<<32;\r
+               idx64 = __umul64hi(uiVal64, uiMul64);\r
+               uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x;\r
+               uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);\r
+               uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;\r
+               idx64 += uiVal;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx64++;\r
+               }\r
+               hData[SHIDX(ii&7)] = cplChrSet[dimItem.z + uiDiv];\r
+       }\r
+\r
+       for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal = idx + cplStart[ii];\r
+               uiVal--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               idx = __umulhi(uiVal, dimItem.y);\r
+               uiDiv = uiVal - idx*dimItem.x;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx++;\r
+               }\r
+               hData[SHIDX(ii&7)] = cplChrSet[dimItem.z + uiDiv];\r
+       }\r
+\r
+       for(jj = 8, ii--; jj < 15; jj++, ii--)\r
+               hData[SHIDX(jj)] = hData[SHIDX(ii&7)];\r
+\r
+       // set key\r
+       ii = 255;\r
+       uiVal = ((hData[SHIDX(10)] << 5) | (hData[SHIDX(11)] >> 3))&ii;\r
+       uiVal = (uiVal<<8) | (((hData[SHIDX(9)] << 6) | (hData[SHIDX(10)] >> 2))&ii);\r
+       uiVal = (uiVal<<8) | (((hData[SHIDX(8)] << 7) | (hData[SHIDX(9)] >> 1))&ii);\r
+       uiVal = (uiVal<<8) | hData[SHIDX(8)];\r
+       \r
+       uiDiv = (hData[SHIDX(14)] << 1)&ii;\r
+       uiDiv = (uiDiv<<8) | (((hData[SHIDX(13)] << 2) | (hData[SHIDX(14)] >> 6))&ii);\r
+       uiDiv = (uiDiv<<8) | (((hData[SHIDX(12)] << 3) | (hData[SHIDX(13)] >> 5))&ii);\r
+       uiDiv = (uiDiv<<8) | (((hData[SHIDX(11)] << 4) | (hData[SHIDX(12)] >> 4))&ii);\r
+\r
+       hData[SHIDX(0)] = uiVal;\r
+       hData[SHIDX(1)] = uiDiv;\r
+       RC_LM::PERM_OP(SHIDX(1), SHIDX(0), SHIDX(2), 4, 0x0f0f0f0f, hData);\r
+       RC_LM::HPERM_OP(SHIDX(0), SHIDX(2), -2, 0xcccc0000, hData);\r
+       RC_LM::HPERM_OP(SHIDX(1), SHIDX(2), -2, 0xcccc0000, hData);\r
+       RC_LM::PERM_OP(SHIDX(1), SHIDX(0), SHIDX(2), 1, 0x55555555, hData);\r
+       RC_LM::PERM_OP(SHIDX(0), SHIDX(1), SHIDX(2), 8, 0x00ff00ff, hData);\r
+       RC_LM::PERM_OP(SHIDX(1), SHIDX(0), SHIDX(2), 1, 0x55555555, hData);\r
+       uiVal = hData[SHIDX(0)];\r
+       uiDiv = hData[SHIDX(1)];\r
+       uiDiv = ((uiDiv&0x000000ff)<<16) | (uiDiv&0x0000ff00) | ((uiDiv&0x00ff0000)>>16) | ((uiVal&0xf0000000)>>4);\r
+       uiVal &= 0x0fffffff;\r
+\r
+       for(ii = 0; ii < ITERATIONS; ii++) {\r
+               if(RC_LM::shifts2[ii]) { \r
+                       uiVal = ((uiVal>>2)|(uiVal<<26)); \r
+                       uiDiv =((uiDiv>>2)|(uiDiv<<26)); \r
+               } else { \r
+                       uiVal = ((uiVal>>1)|(uiVal<<27)); \r
+                       uiDiv = ((uiDiv>>1)|(uiDiv<<27)); \r
+               }\r
+               uiVal &= 0x0fffffff;\r
+               uiDiv &= 0x0fffffff;\r
+\r
+               rs = RC_LM::des_skb[0][uiVal&0x3f] |\r
+                       RC_LM::des_skb[1][((uiVal>>6)&0x03)|((uiVal>>7)&0x3c)] |\r
+                       RC_LM::des_skb[2][((uiVal>>13)&0x0f)|((uiVal>>14)&0x30)] |\r
+                       RC_LM::des_skb[3][((uiVal>>20)&0x01)|((uiVal>>21)&0x06) |\r
+                       ((uiVal>>22)&0x38)];\r
+               rt = RC_LM::des_skb[4][uiDiv&0x3f] |\r
+                       RC_LM::des_skb[5][((uiDiv>>7)&0x03)|((uiDiv>>8)&0x3c)] |\r
+                       RC_LM::des_skb[6][(uiDiv>>15)&0x3f] |\r
+                       RC_LM::des_skb[7][((uiDiv>>21)&0x0f)|((uiDiv>>22)&0x30)];\r
+\r
+               /* table contained 0213 4657 */\r
+               idx = (rt<<16)|(rs&0x0000ffff);\r
+               hData[SHIDX(ii)] = (idx>>30)|(idx<<2);\r
+               idx = (rs>>16)|(rt&0xffff0000);\r
+               hData2[SHIDX(ii)] = (idx>>26)|(idx<<6);\r
+       }\r
+\r
+       // encrypt the "magic" data\r
+       hData3[SHIDX(0)] = 0x2153474B;\r
+       hData3[SHIDX(1)] = 0x25242340;\r
+\r
+       RC_LM::IP(SHIDX(0), SHIDX(1), SHIDX(2), hData3);\r
+       uiVal = hData3[SHIDX(0)];\r
+       uiVal = ((uiVal>>29)|(uiVal<<3));\r
+       uiDiv = hData3[SHIDX(1)];\r
+       uiDiv = ((uiDiv>>29)|(uiDiv<<3));\r
+\r
+       for(ii = 0; ii < 16; ii+=2) {\r
+               uiDiv = RC_LM::D_ENCRYPT(uiDiv, uiVal^hData[SHIDX(ii)], uiVal^hData2[SHIDX(ii)]);\r
+               uiVal = RC_LM::D_ENCRYPT(uiVal, uiDiv^hData[SHIDX(ii+1)], uiDiv^hData2[SHIDX(ii+1)]);\r
+       }\r
+\r
+       hData3[SHIDX(0)] = ((uiVal>>3)|(uiVal<<29));\r
+       hData3[SHIDX(1)] = ((uiDiv>>3)|(uiDiv<<29));\r
+       RC_LM::FP(SHIDX(0), SHIDX(1), SHIDX(2), hData3);\r
+       \r
+       hData[SHIDX(0)] = hData3[SHIDX(1)];\r
+       hData[SHIDX(1)] = hData3[SHIDX(0)];\r
+       RTGEN_EPILOGUE\r
+}\r
diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md4.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md4.inc
new file mode 100644 (file)
index 0000000..362e608
--- /dev/null
@@ -0,0 +1,143 @@
+//============================================================================\r
+// Name        : rcuda_md4.inc\r
+// Author      : Jan Kyska\r
+// Version     : 1.00\r
+// Description : MD4 hash kernel for Generator of FreeRainbowTables\r
+//============================================================================ \r
+\r
+namespace RC_MD4 {\r
+\r
+__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 };\r
+__device__ __constant__ unsigned char r[48] = { \\r
+                                                       3,  7, 11, 19,  3,  7, 11, 19,  3,  7, 11, 19,  3,  7, 11, 19, \\r
+                                                       3,  5,  9, 13,  3,  5,  9, 13,  3,  5,  9, 13,  3,  5,  9, 13, \\r
+                                                       3,  9, 11, 15,  3,  9, 11, 15,  3,  9, 11, 15,  3,  9, 11, 15 };\r
+__device__ __constant__ unsigned char g[48] = { \\r
+                                                       0, 1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15, \\r
+                                                       0, 4,  8, 12,   1,  5,  9, 13,   2,  6, 10, 14,   3,  7, 11, 15, \\r
+                                                       0, 8,  4, 12,   2, 10,  6, 14,   1,  9,  5, 13,   3, 11,  7, 15 };\r
+\r
+__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + ((b&c)|((~b)&d)) + data[SHIDX(g[i])];\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       return ret;\r
+}\r
+\r
+__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + ((b&c)|(b&d)|(c&d)) + data[SHIDX(g[i])] + 0x5a827999u;\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       return ret;\r
+}\r
+\r
+__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + (b^c^d) + data[SHIDX(g[i])] + 0x6ed9eba1u;\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       return ret;\r
+}\r
+\r
+__device__ void MD4(unsigned int* dataHash) {\r
+       unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x;\r
+       int ii;\r
+\r
+       // Round 1\r
+       for(ii = 0; ii < 16; ii++) {\r
+               x = b;\r
+               b = FF(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+\r
+       // Round 2\r
+       for(; ii < 32; ii++) {\r
+               x = b;\r
+               b = GG(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+       \r
+       // Round 3\r
+       for(; ii < 48; ii++) {\r
+               x = b;\r
+               b = HH(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+       \r
+       dataHash[SHIDX(0)] = a + h[0];\r
+       dataHash[SHIDX(1)] = b + h[1];\r
+       dataHash[SHIDX(2)] = c + h[2];\r
+       dataHash[SHIDX(3)] = d + h[3];\r
+}\r
+\r
+}\r
+\r
+__global__ void RTGenMD4Kernel(unsigned int chainStart, unsigned int chainStop) {\r
+       uint3 dimItem;\r
+       uint64 uiDiv64, uiVal64, uiMul64;\r
+       unsigned int uiVal, uiDiv;\r
+       unsigned int size, jj;\r
+       unsigned int plain;\r
+\r
+       RTGEN_PROLOGUE;\r
+       \r
+       // transform to the plain text\r
+       plain = 0x80;\r
+       jj = (PLAIN_MAX_SIZE>>2)+1;\r
+\r
+       for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal64 = idx64 + cplStart[ii];\r
+               uiVal64--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               uiMul64 = (uint64)dimItem.y<<32;\r
+               idx64 = __umul64hi(uiVal64, uiMul64);\r
+               uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x;\r
+               uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);\r
+               uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;\r
+               idx64 += uiVal;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx64++;\r
+               }\r
+               plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv];\r
+               if((ii&3) == 2) {\r
+                       hData[SHIDX(jj--)] = plain;\r
+                       plain = 0;\r
+               }\r
+       }\r
+\r
+       for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal = idx + cplStart[ii];\r
+               uiVal--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               idx = __umulhi(uiVal, dimItem.y);\r
+               uiDiv = uiVal - idx*dimItem.x;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx++;\r
+               }\r
+               plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv];\r
+               if((ii&3) == 2) {\r
+                       hData[SHIDX(jj--)] = plain;\r
+                       plain = 0;\r
+               }\r
+       }\r
+\r
+       // prepare for MD4\r
+       size = ii;\r
+       ii = ((((3-(ii&3))<<3)-1)&0x1f)+1;\r
+       plain = plain<<ii;\r
+       for(jj++, idx = 0; jj <= (PLAIN_MAX_SIZE>>2)+1; plain = hData[SHIDX(jj++)], idx++)\r
+               hData[SHIDX(idx)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii));\r
+       hData[SHIDX(idx)] = plain>>ii;\r
+       for(idx++; idx < 14; idx++)\r
+               hData[SHIDX(idx)] = 0;\r
+       hData[SHIDX(idx++)] = size<<3;\r
+       hData[SHIDX(idx)] = 0;\r
+\r
+       // hash\r
+       RC_MD4::MD4(hData);\r
+\r
+       RTGEN_EPILOGUE\r
+}\r
diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md5.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_md5.inc
new file mode 100644 (file)
index 0000000..77d8696
--- /dev/null
@@ -0,0 +1,183 @@
+//============================================================================\r
+// Name        : rcuda_md5.inc\r
+// Author      : Jan Kyska\r
+// Version     : 1.00\r
+// Description : MD5 hash kernel for Generator of FreeRainbowTables\r
+//============================================================================ \r
+\r
+namespace RC_MD5 {\r
+\r
+__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 };\r
+__device__ __constant__ unsigned char r[64] = { \\r
+                                                       7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22, \\r
+                                                       5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20, \\r
+                                                       4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23, \\r
+                                                       6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21 };\r
+__device__ __constant__ unsigned char g[64] = { \\r
+                                                       0, 1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15, \\r
+                                                       1, 6, 11,  0,   5, 10, 15,  4,   9, 14,  3,  8,  13,  2,  7, 12, \\r
+                                                       5, 8, 11, 14,   1,  4,  7, 10,  13,  0,  3,  6,   9, 12, 15,  2, \\r
+                                                       0, 7, 14,  5,  12,  3, 10,  1,   8, 15,  6, 13,   4, 11,  2,  9 };\r
+__device__ __constant__ unsigned int ac[64] = { \\r
+                                                       0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, \\r
+                                                       0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, \\r
+                                                       0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, \\r
+                                                       0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, \\r
+                                                       \\r
+                                                       0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, \\r
+                                                       0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, \\r
+                                                       0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, \\r
+                                                       0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, \\r
+                                                       \\r
+                                                       0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, \\r
+                                                       0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, \\r
+                                                       0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, \\r
+                                                       0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, \\r
+                                                       \\r
+                                                       0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, \\r
+                                                       0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, \\r
+                                                       0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, \\r
+                                                       0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };\r
+\r
+__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + ((b&c)|((~b)&d)) + ac[i] + data[SHIDX(g[i])];\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       ret += b;\r
+       return ret;\r
+}\r
+\r
+__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + ((b&d)|(c&(~d))) + ac[i] + data[SHIDX(g[i])];\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       ret += b;\r
+       return ret;\r
+}\r
+\r
+__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + (b^c^d) + ac[i] + data[SHIDX(g[i])];\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       ret += b;\r
+       return ret;\r
+}\r
+\r
+__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + (c^(b|(~d))) + ac[i] + data[SHIDX(g[i])];\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       ret += b;\r
+       return ret;\r
+}\r
+\r
+__device__ void MD5(unsigned int* dataHash) {\r
+       unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x;\r
+       int ii;\r
+\r
+       // Round 1\r
+       for(ii = 0; ii < 16; ii++) {\r
+               x = b;\r
+               b = FF(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+\r
+       // Round 2\r
+       for(; ii < 32; ii++) {\r
+               x = b;\r
+               b = GG(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+       \r
+       // Round 3\r
+       for(; ii < 48; ii++) {\r
+               x = b;\r
+               b = HH(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+       \r
+       // Round 4\r
+       for(; ii < 64; ii++) {\r
+               x = b;\r
+               b = II(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+\r
+       dataHash[SHIDX(0)] = a + h[0];\r
+       dataHash[SHIDX(1)] = b + h[1];\r
+       dataHash[SHIDX(2)] = c + h[2];\r
+       dataHash[SHIDX(3)] = d + h[3];\r
+}\r
+\r
+}\r
+\r
+__global__ void RTGenMD5Kernel(unsigned int chainStart, unsigned int chainStop) {\r
+       uint3 dimItem;\r
+       uint64 uiDiv64, uiVal64, uiMul64;\r
+       unsigned int uiVal, uiDiv;\r
+       unsigned int size, jj;\r
+       unsigned int plain;\r
+\r
+       RTGEN_PROLOGUE;\r
+       \r
+       // transform to the plain text\r
+       plain = 0x80;\r
+       jj = (PLAIN_MAX_SIZE>>2)+1;\r
+\r
+       for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal64 = idx64 + cplStart[ii];\r
+               uiVal64--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               uiMul64 = (uint64)dimItem.y<<32;\r
+               idx64 = __umul64hi(uiVal64, uiMul64);\r
+               uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x;\r
+               uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);\r
+               uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;\r
+               idx64 += uiVal;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx64++;\r
+               }\r
+               plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv];\r
+               if((ii&3) == 2) {\r
+                       hData[SHIDX(jj--)] = plain;\r
+                       plain = 0;\r
+               }\r
+       }\r
+\r
+       for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal = idx + cplStart[ii];\r
+               uiVal--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               idx = __umulhi(uiVal, dimItem.y);\r
+               uiDiv = uiVal - idx*dimItem.x;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx++;\r
+               }\r
+               plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv];\r
+               if((ii&3) == 2) {\r
+                       hData[SHIDX(jj--)] = plain;\r
+                       plain = 0;\r
+               }\r
+       }\r
+\r
+       // prepare for MD5\r
+       size = ii;\r
+       ii = ((((3-(ii&3))<<3)-1)&0x1f)+1;\r
+       plain = plain<<ii;\r
+       for(jj++, idx = 0; jj <= (PLAIN_MAX_SIZE>>2)+1; plain = hData[SHIDX(jj++)], idx++)\r
+               hData[SHIDX(idx)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii));\r
+       hData[SHIDX(idx)] = plain>>ii;\r
+       for(idx++; idx < 14; idx++)\r
+               hData[SHIDX(idx)] = 0;\r
+       hData[SHIDX(idx++)] = size<<3;\r
+       hData[SHIDX(idx)] = 0;\r
+\r
+       // hash\r
+       RC_MD5::MD5(hData);\r
+\r
+       RTGEN_EPILOGUE\r
+}\r
diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ntlm.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_ntlm.inc
new file mode 100644 (file)
index 0000000..1c14c5b
--- /dev/null
@@ -0,0 +1,145 @@
+//============================================================================\r
+// Name        : rcuda_ntlm.inc\r
+// Author      : Jan Kyska\r
+// Version     : 1.00\r
+// Description : NTLM hash kernel for Generator of FreeRainbowTables\r
+//============================================================================ \r
+\r
+namespace RC_NTLM {\r
+\r
+__device__ __constant__ unsigned int h[4] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 };\r
+__device__ __constant__ unsigned char r[48] = { \\r
+                                                       3,  7, 11, 19,  3,  7, 11, 19,  3,  7, 11, 19,  3,  7, 11, 19, \\r
+                                                       3,  5,  9, 13,  3,  5,  9, 13,  3,  5,  9, 13,  3,  5,  9, 13, \\r
+                                                       3,  9, 11, 15,  3,  9, 11, 15,  3,  9, 11, 15,  3,  9, 11, 15 };\r
+__device__ __constant__ unsigned char g[48] = { \\r
+                                                       0, 1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15, \\r
+                                                       0, 4,  8, 12,   1,  5,  9, 13,   2,  6, 10, 14,   3,  7, 11, 15, \\r
+                                                       0, 8,  4, 12,   2, 10,  6, 14,   1,  9,  5, 13,   3, 11,  7, 15 };\r
+\r
+__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + ((b&c)|((~b)&d)) + data[SHIDX(g[i])];\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       return ret;\r
+}\r
+\r
+__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + ((b&c)|(b&d)|(c&d)) + data[SHIDX(g[i])] + 0x5a827999u;\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       return ret;\r
+}\r
+\r
+__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, int i, const unsigned int* data) {\r
+       unsigned int ret;\r
+       ret = a + (b^c^d) + data[SHIDX(g[i])] + 0x6ed9eba1u;\r
+       ret = (ret<<r[i])|(ret>>(32-r[i]));\r
+       return ret;\r
+}\r
+\r
+__device__ void MD4(unsigned int* dataHash) {\r
+       unsigned int a = h[0], b = h[1], c = h[2], d = h[3], x;\r
+       int ii;\r
+\r
+       // Round 1\r
+       for(ii = 0; ii < 16; ii++) {\r
+               x = b;\r
+               b = FF(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+\r
+       // Round 2\r
+       for(; ii < 32; ii++) {\r
+               x = b;\r
+               b = GG(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+       \r
+       // Round 3\r
+       for(; ii < 48; ii++) {\r
+               x = b;\r
+               b = HH(a, b, c, d, ii, dataHash);\r
+               a = d; d = c; c = x;\r
+       }\r
+       \r
+       dataHash[SHIDX(0)] = a + h[0];\r
+       dataHash[SHIDX(1)] = b + h[1];\r
+       dataHash[SHIDX(2)] = c + h[2];\r
+       dataHash[SHIDX(3)] = d + h[3];\r
+}\r
+\r
+}\r
+\r
+__global__ void RTGenNTLMKernel(unsigned int chainStart, unsigned int chainStop) {\r
+       uint3 dimItem;\r
+       uint64 uiDiv64, uiVal64, uiMul64;\r
+       unsigned int uiVal, uiDiv;\r
+       unsigned int size, jj, kk;\r
+       unsigned int plain;\r
+\r
+       RTGEN_PROLOGUE;\r
+       \r
+       // transform to the plain text\r
+       plain = 0x80;\r
+       jj = (PLAIN_MAX_SIZE>>1)+1;\r
+\r
+       for(ii = kk = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal64 = idx64 + cplStart[ii];\r
+               uiVal64--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               uiMul64 = (uint64)dimItem.y<<32;\r
+               idx64 = __umul64hi(uiVal64, uiMul64);\r
+               uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x;\r
+               uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);\r
+               uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;\r
+               idx64 += uiVal;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx64++;\r
+               }\r
+               plain = (plain<<8);\r
+               if((kk++)&1) {\r
+                       hData[SHIDX(jj--)] = plain;\r
+                       plain = 0;\r
+               }\r
+               plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv];\r
+       }\r
+\r
+       for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal = idx + cplStart[ii];\r
+               uiVal--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               idx = __umulhi(uiVal, dimItem.y);\r
+               uiDiv = uiVal - idx*dimItem.x;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx++;\r
+               }\r
+               plain = (plain<<8);\r
+               if((kk++)&1) {\r
+                       hData[SHIDX(jj--)] = plain;\r
+                       plain = 0;\r
+               }\r
+               plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv];\r
+       }\r
+\r
+       // prepare for MD4\r
+       size = (ii<<1);\r
+       ii = (((kk^1)&1)<<4)+8;\r
+       plain = plain<<ii;\r
+       for(jj++, idx = 0; jj <= (PLAIN_MAX_SIZE>>1)+1; plain = hData[SHIDX(jj++)], idx++)\r
+               hData[SHIDX(idx)] = (plain>>ii)|(hData[SHIDX(jj)]<<(32-ii));\r
+       hData[SHIDX(idx)] = plain>>ii;\r
+       for(idx++; idx < 14; idx++)\r
+               hData[SHIDX(idx)] = 0;\r
+       hData[SHIDX(idx++)] = size<<3;\r
+       hData[SHIDX(idx)] = 0;\r
+\r
+       // hash\r
+       RC_NTLM::MD4(hData);\r
+\r
+       RTGEN_EPILOGUE\r
+}\r
diff --git a/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_sha1.inc b/BOINC software/BOINC client apps/distrrtgen_cuda/rcuda_sha1.inc
new file mode 100644 (file)
index 0000000..af621d2
--- /dev/null
@@ -0,0 +1,160 @@
+//============================================================================\r
+// Name        : rcuda_sha1.inc\r
+// Author      : Jan Kyska\r
+// Version     : 1.00\r
+// Description : SHA1 hash kernel for Generator of FreeRainbowTables\r
+//============================================================================ \r
+\r
+namespace RC_SHA1 {\r
+\r
+__device__ __constant__ unsigned int h[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };\r
+\r
+__device__ unsigned int SwapEndian(unsigned int n) {\r
+       return (n<<24)|((n&0x0000ff00)<<8)|((n>>8)&0x0000ff00)|(n>>24);\r
+}\r
+\r
+__device__ unsigned int FF(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) {\r
+       return ((a<<5)|(a>>27)) + ((b&c)|((~b)&d)) + e + 0x5A827999 + data[SHIDX(i)];\r
+}\r
+\r
+__device__ unsigned int FF2(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) {\r
+       unsigned int dt;\r
+       dt = data[SHIDX(i&15)]^data[SHIDX((i-3)&15)]^data[SHIDX((i-8)&15)]^data[SHIDX((i-14)&15)];\r
+       data[SHIDX(i&15)] = dt = ((dt<<1)|(dt>>31));\r
+       return ((a<<5)|(a>>27)) + ((b&c)|((~b)&d)) + e + 0x5A827999 + dt;\r
+}\r
+\r
+__device__ unsigned int GG(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) {\r
+       unsigned int dt;\r
+       dt = data[SHIDX(i&15)]^data[SHIDX((i-3)&15)]^data[SHIDX((i-8)&15)]^data[SHIDX((i-14)&15)];\r
+       data[SHIDX(i&15)] = dt = ((dt<<1)|(dt>>31));\r
+       return ((a<<5)|(a>>27)) + (b^c^d) + e + 0x6ED9EBA1 + dt;\r
+}\r
+\r
+__device__ unsigned int HH(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) {\r
+       unsigned int dt;\r
+       dt = data[SHIDX(i&15)]^data[SHIDX((i-3)&15)]^data[SHIDX((i-8)&15)]^data[SHIDX((i-14)&15)];\r
+       data[SHIDX(i&15)] = dt = ((dt<<1)|(dt>>31));\r
+       return ((a<<5)|(a>>27)) + ((b&c)|(b&d)|(c&d)) + e + 0x8F1BBCDC + dt;\r
+}\r
+\r
+__device__ unsigned int II(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, int i, unsigned int* data) {\r
+       unsigned int dt;\r
+       dt = data[SHIDX(i&15)]^data[SHIDX((i-3)&15)]^data[SHIDX((i-8)&15)]^data[SHIDX((i-14)&15)];\r
+       data[SHIDX(i&15)] = dt = ((dt<<1)|(dt>>31));\r
+       return ((a<<5)|(a>>27)) + (b^c^d) + e + 0xCA62C1D6 + dt;\r
+}\r
+\r
+__device__ void SHA1(unsigned int* dataHash) {\r
+       unsigned int a = h[0], b = h[1], c = h[2], d = h[3], e = h[4], x;\r
+       int ii;\r
+\r
+       // Round 1\r
+       for(ii = 0; ii < 16; ii++) {\r
+               x = FF(a, b, c, d, e, ii, dataHash);\r
+               e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x;\r
+       }\r
+       for(; ii < 20; ii++) {\r
+               x = FF2(a, b, c, d, e, ii, dataHash);\r
+               e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x;\r
+       }\r
+       \r
+       // Round 2\r
+       for(; ii < 40; ii++) {\r
+               x = GG(a, b, c, d, e, ii, dataHash);\r
+               e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x;\r
+       }\r
+\r
+       // Round 3\r
+       for(; ii < 60; ii++) {\r
+               x = HH(a, b, c, d, e, ii, dataHash);\r
+               e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x;\r
+       }\r
+       \r
+       // Round 4\r
+       for(; ii < 80; ii++) {\r
+               x = II(a, b, c, d, e, ii, dataHash);\r
+               e = d; d = c; c = ((b<<30)|(b>>2)); b = a; a = x;\r
+       }\r
+\r
+       dataHash[SHIDX(0)] = a + h[0];\r
+       dataHash[SHIDX(1)] = b + h[1];\r
+       dataHash[SHIDX(2)] = c + h[2];\r
+       dataHash[SHIDX(3)] = d + h[3];\r
+       dataHash[SHIDX(4)] = e + h[4];\r
+}\r
+\r
+}\r
+\r
+__global__ void RTGenSHA1Kernel(unsigned int chainStart, unsigned int chainStop) {\r
+       uint3 dimItem;\r
+       uint64 uiDiv64, uiVal64, uiMul64;\r
+       unsigned int uiVal, uiDiv;\r
+       unsigned int size, jj;\r
+       unsigned int plain;\r
+\r
+       RTGEN_PROLOGUE;\r
+       \r
+       // transform to the plain text\r
+       plain = 0x80;\r
+       jj = (PLAIN_MAX_SIZE>>2)+1;\r
+\r
+       for(ii = 0; idx64 > 0xfffffff0ull && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal64 = idx64 + cplStart[ii];\r
+               uiVal64--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               uiMul64 = (uint64)dimItem.y<<32;\r
+               idx64 = __umul64hi(uiVal64, uiMul64);\r
+               uiDiv64 = uiVal64 - idx64*(uint64)dimItem.x;\r
+               uiVal = __umulhi((unsigned int)uiDiv64, dimItem.y);\r
+               uiDiv = (unsigned int)uiDiv64 - uiVal * dimItem.x;\r
+               idx64 += uiVal;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx64++;\r
+               }\r
+               plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv];\r
+               if((ii&3) == 2) {\r
+                       hData[SHIDX(jj--)] = plain;\r
+                       plain = 0;\r
+               }\r
+       }\r
+\r
+       for(idx = (unsigned int)idx64; idx != 0 && ii < PLAIN_MAX_SIZE; ii++) {\r
+               uiVal = idx + cplStart[ii];\r
+               uiVal--;\r
+               dimItem = cplDimVec[ii];\r
+\r
+               idx = __umulhi(uiVal, dimItem.y);\r
+               uiDiv = uiVal - idx*dimItem.x;\r
+               if(uiDiv >= dimItem.x) {\r
+                       uiDiv -= dimItem.x;\r
+                       idx++;\r
+               }\r
+               plain = (plain<<8) | cplChrSet[dimItem.z + uiDiv];\r
+               if((ii&3) == 2) {\r
+                       hData[SHIDX(jj--)] = plain;\r
+                       plain = 0;\r
+               }\r
+       }\r
+\r
+       // prepare for SHA1\r
+       size = ii;\r
+       ii = ((((3-(ii&3))<<3)-1)&0x1f)+1;\r
+       plain = plain<<ii;\r
+       for(jj++, idx = 0; jj <= (PLAIN_MAX_SIZE>>2)+1; plain = hData[SHIDX(jj++)], idx++)\r
+               hData[SHIDX(idx)] = RC_SHA1::SwapEndian((plain>>ii)|(hData[SHIDX(jj)]<<(32-ii)));\r
+       hData[SHIDX(idx)] = RC_SHA1::SwapEndian(plain>>ii);\r
+       for(idx++; idx < 14; idx++)\r
+               hData[SHIDX(idx)] = 0;\r
+       hData[SHIDX(idx++)] = 0;\r
+       hData[SHIDX(idx)] = size<<3;\r
+\r
+       // hash\r
+       RC_SHA1::SHA1(hData);\r
+       \r
+       hData[SHIDX(0)] = RC_SHA1::SwapEndian(hData[SHIDX(0)]);\r
+       hData[SHIDX(1)] = RC_SHA1::SwapEndian(hData[SHIDX(1)]);\r
+       RTGEN_EPILOGUE\r
+}\r