\r
#include "simd.h"\r
\r
-#include <tbb/parallel_for.h>\r
-\r
#include <intrin.h>\r
#include <stdint.h>\r
\r
const xmm_epi16 round = 128;\r
const xmm_epi16 lomask = 0x00FF;\r
\r
- tbb::parallel_for(tbb::blocked_range<size_t>(0, count/sizeof(xmm_epi8)), [&](const tbb::blocked_range<size_t>& r)\r
+ for(auto n = 0; n < count; n += 16) \r
{\r
- for(auto n = r.begin(); n != r.end(); ++n) \r
- {\r
- auto s = xmm_epi16::load(source1+n*16);\r
- auto d = xmm_epi16::load(source2+n*16);\r
+ auto s = xmm_epi16::load(source1+n);\r
+ auto d = xmm_epi16::load(source2+n);\r
\r
- // T(S, D) = S * D[A] + 0x80\r
- auto xxxa = xmm_cast<xmm_epi32>(d) >> 24;\r
- auto xaxa = xmm_cast<xmm_epi16>((xxxa << 16) | xxxa);\r
+ // T(S, D) = S * D[A] + 0x80\r
+ auto xxxa = xmm_cast<xmm_epi32>(d) >> 24;\r
+ auto xaxa = xmm_cast<xmm_epi16>((xxxa << 16) | xxxa);\r
\r
- auto xbxr = s & lomask;\r
- auto t1 = xmm_epi16::multiply_low(xbxr, xaxa) + round; \r
+ auto xbxr = s & lomask;\r
+ auto t1 = xmm_epi16::multiply_low(xbxr, xaxa) + round; \r
\r
- auto xaxg = s >> 8;\r
- auto t2 = xmm_epi16::multiply_low(xaxg, xaxa) + round;\r
+ auto xaxg = s >> 8;\r
+ auto t2 = xmm_epi16::multiply_low(xaxg, xaxa) + round;\r
\r
- // C(S, D) = S + D - (((T >> 8) + T) >> 8);\r
- auto bxrx = (t1 >> 8) + t1; \r
- auto axgx = (t2 >> 8) + t2; \r
- auto bgra = xmm_cast<xmm_epi8>((bxrx >> 8) | xmm_epi16::and_not(axgx, lomask));\r
+ // C(S, D) = S + D - (((T >> 8) + T) >> 8);\r
+ auto bxrx = (t1 >> 8) + t1; \r
+ auto axgx = (t2 >> 8) + t2; \r
+ auto bgra = xmm_cast<xmm_epi8>((bxrx >> 8) | xmm_epi16::and_not(axgx, lomask));\r
\r
- xmm_epi8::stream(xmm_cast<xmm_epi8>(s) + (xmm_cast<xmm_epi8>(d) - bgra), dest + n*16);\r
- } \r
- });\r
+ xmm_epi8::stream(xmm_cast<xmm_epi8>(s) + (xmm_cast<xmm_epi8>(d) - bgra), dest+n);\r
+ } \r
}\r
\r
}}}
\ No newline at end of file